Repository: hankcs/HanLP
Branch: doc-zh
Commit: ddb1299bddff
Files: 697
Total size: 3.2 MB

Directory structure:
gitextract_p7um9exn/

├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   ├── config.yml
│   │   └── feature_request.md
│   ├── pull_request_template.md
│   └── workflows/
│       └── unit-tests.yml
├── .gitignore
├── CITATION.cff
├── LICENSE
├── README.md
├── docs/
│   ├── Makefile
│   ├── annotations/
│   │   ├── constituency/
│   │   │   ├── ctb.md
│   │   │   ├── index.md
│   │   │   ├── npcmj.md
│   │   │   └── ptb.md
│   │   ├── dep/
│   │   │   ├── index.md
│   │   │   ├── pmt.md
│   │   │   ├── sd_en.md
│   │   │   ├── sd_zh.md
│   │   │   └── ud.md
│   │   ├── index.md
│   │   ├── ner/
│   │   │   ├── index.md
│   │   │   ├── msra.md
│   │   │   ├── ontonotes.md
│   │   │   └── pku.md
│   │   ├── pos/
│   │   │   ├── 863.md
│   │   │   ├── ctb.md
│   │   │   ├── index.md
│   │   │   ├── npcmj.md
│   │   │   ├── pku.md
│   │   │   └── ud.md
│   │   ├── sdp/
│   │   │   ├── dm.md
│   │   │   ├── index.md
│   │   │   ├── pas.md
│   │   │   ├── psd.md
│   │   │   └── semeval16.md
│   │   ├── srl/
│   │   │   ├── cpb.md
│   │   │   ├── index.md
│   │   │   └── propbank.md
│   │   └── tok/
│   │       ├── ctb.md
│   │       ├── index.md
│   │       └── msr.md
│   ├── api/
│   │   ├── common/
│   │   │   ├── configurable.rst
│   │   │   ├── conll.rst
│   │   │   ├── constant.rst
│   │   │   ├── document.rst
│   │   │   └── index.md
│   │   ├── hanlp/
│   │   │   ├── common/
│   │   │   │   ├── component.rst
│   │   │   │   ├── dataset.md
│   │   │   │   ├── index.md
│   │   │   │   ├── structure.md
│   │   │   │   ├── torch_component.md
│   │   │   │   ├── transform.md
│   │   │   │   └── vocab.md
│   │   │   ├── components/
│   │   │   │   ├── classifiers.md
│   │   │   │   ├── eos.md
│   │   │   │   ├── index.md
│   │   │   │   ├── lemmatizer.md
│   │   │   │   ├── mtl/
│   │   │   │   │   ├── index.md
│   │   │   │   │   ├── mtl.md
│   │   │   │   │   └── tasks/
│   │   │   │   │       ├── constituency.md
│   │   │   │   │       ├── dep.md
│   │   │   │   │       ├── index.md
│   │   │   │   │       ├── lem.md
│   │   │   │   │       ├── ner/
│   │   │   │   │       │   ├── biaffine_ner.md
│   │   │   │   │       │   ├── index.md
│   │   │   │   │       │   └── tag_ner.md
│   │   │   │   │       ├── pos.md
│   │   │   │   │       ├── sdp.md
│   │   │   │   │       ├── srl/
│   │   │   │   │       │   ├── bio_srl.md
│   │   │   │   │       │   ├── index.md
│   │   │   │   │       │   └── rank_srl.md
│   │   │   │   │       ├── task.md
│   │   │   │   │       ├── tok.md
│   │   │   │   │       └── ud.md
│   │   │   │   ├── ner/
│   │   │   │   │   ├── biaffine_ner.md
│   │   │   │   │   ├── index.md
│   │   │   │   │   ├── rnn_ner.md
│   │   │   │   │   └── transformer_ner.md
│   │   │   │   ├── parsers/
│   │   │   │   │   ├── biaffine_dep.md
│   │   │   │   │   ├── biaffine_sdp.md
│   │   │   │   │   ├── crf_constituency_parser.md
│   │   │   │   │   ├── index.md
│   │   │   │   │   └── ud_parser.md
│   │   │   │   ├── pipeline.md
│   │   │   │   ├── srl/
│   │   │   │   │   ├── index.md
│   │   │   │   │   ├── span_bio.md
│   │   │   │   │   └── span_rank.md
│   │   │   │   ├── sts.md
│   │   │   │   ├── taggers/
│   │   │   │   │   ├── index.md
│   │   │   │   │   ├── rnn_tagger.md
│   │   │   │   │   └── transformer_tagger.md
│   │   │   │   └── tokenizers/
│   │   │   │       ├── index.md
│   │   │   │       ├── multi_criteria.md
│   │   │   │       └── transformer.md
│   │   │   ├── datasets/
│   │   │   │   ├── constituency/
│   │   │   │   │   ├── constituency_dataset.md
│   │   │   │   │   ├── index.md
│   │   │   │   │   └── resources.md
│   │   │   │   ├── dep/
│   │   │   │   │   ├── conll_dataset.md
│   │   │   │   │   ├── index.md
│   │   │   │   │   └── resources.md
│   │   │   │   ├── eos/
│   │   │   │   │   ├── eos.md
│   │   │   │   │   ├── index.md
│   │   │   │   │   └── resources.md
│   │   │   │   ├── index.md
│   │   │   │   ├── ner/
│   │   │   │   │   ├── index.md
│   │   │   │   │   ├── json.md
│   │   │   │   │   ├── resources.md
│   │   │   │   │   └── tsv.md
│   │   │   │   ├── pos/
│   │   │   │   │   ├── index.md
│   │   │   │   │   └── resources.md
│   │   │   │   ├── srl/
│   │   │   │   │   ├── conll2012_dataset.md
│   │   │   │   │   ├── index.md
│   │   │   │   │   └── resources.md
│   │   │   │   └── tok/
│   │   │   │       ├── index.md
│   │   │   │       ├── mcws_dataset.md
│   │   │   │       ├── resources.md
│   │   │   │       └── txt.md
│   │   │   ├── hanlp.rst
│   │   │   ├── index.md
│   │   │   ├── layers/
│   │   │   │   ├── decoders/
│   │   │   │   │   ├── biaffine_ner.md
│   │   │   │   │   ├── index.md
│   │   │   │   │   └── linear_crf.md
│   │   │   │   ├── embeddings/
│   │   │   │   │   ├── char_cnn.md
│   │   │   │   │   ├── char_rnn.md
│   │   │   │   │   ├── embedding.md
│   │   │   │   │   ├── fasttext.md
│   │   │   │   │   ├── index.md
│   │   │   │   │   ├── transformer.md
│   │   │   │   │   └── word2vec.md
│   │   │   │   ├── index.md
│   │   │   │   └── transformers/
│   │   │   │       ├── encoder.md
│   │   │   │       ├── index.md
│   │   │   │       └── tokenizer.md
│   │   │   ├── pretrained/
│   │   │   │   ├── amr.md
│   │   │   │   ├── amr2text.md
│   │   │   │   ├── constituency.md
│   │   │   │   ├── dep.md
│   │   │   │   ├── eos.md
│   │   │   │   ├── fasttext.md
│   │   │   │   ├── glove.md
│   │   │   │   ├── index.md
│   │   │   │   ├── mlm.md
│   │   │   │   ├── mtl.md
│   │   │   │   ├── ner.md
│   │   │   │   ├── pos.md
│   │   │   │   ├── sdp.md
│   │   │   │   ├── srl.md
│   │   │   │   ├── sts.md
│   │   │   │   ├── tok.md
│   │   │   │   └── word2vec.md
│   │   │   └── utils/
│   │   │       ├── index.md
│   │   │       └── io_util.md
│   │   ├── restful.rst
│   │   ├── restful_golang.md
│   │   ├── restful_java.md
│   │   └── trie/
│   │       ├── dictionary.md
│   │       ├── index.md
│   │       └── trie.md
│   ├── conf.py
│   ├── configure.md
│   ├── contributing.md
│   ├── data_format.md
│   ├── index.md
│   ├── install.md
│   ├── references.bib
│   ├── references.rst
│   └── tutorial.md
├── hanlp/
│   ├── __init__.py
│   ├── callbacks/
│   │   ├── __init__.py
│   │   └── fine_csv_logger.py
│   ├── common/
│   │   ├── __init__.py
│   │   ├── component.py
│   │   ├── dataset.py
│   │   ├── keras_component.py
│   │   ├── structure.py
│   │   ├── torch_component.py
│   │   ├── transform.py
│   │   ├── transform_tf.py
│   │   ├── vocab.py
│   │   └── vocab_tf.py
│   ├── components/
│   │   ├── __init__.py
│   │   ├── amr/
│   │   │   ├── __init__.py
│   │   │   ├── amrbart/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── bart_amr_generation.py
│   │   │   │   ├── bart_amr_parser.py
│   │   │   │   ├── common/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── constant.py
│   │   │   │   │   ├── penman_interface.py
│   │   │   │   │   └── postprocessing.py
│   │   │   │   ├── data_interface/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── dataset.py
│   │   │   │   ├── model_interface/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── modeling_bart.py
│   │   │   │   │   └── tokenization_bart.py
│   │   │   │   └── preprocess/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── amr_io.py
│   │   │   │       ├── penman_interface.py
│   │   │   │       └── read_and_process.py
│   │   │   └── seq2seq/
│   │   │       ├── __init__.py
│   │   │       ├── dataset/
│   │   │       │   ├── IO.py
│   │   │       │   ├── __init__.py
│   │   │       │   ├── dataset.py
│   │   │       │   ├── linearization.py
│   │   │       │   ├── penman.py
│   │   │       │   ├── postprocessing.py
│   │   │       │   ├── tokenization_bart.py
│   │   │       │   └── tokenization_t5.py
│   │   │       ├── evaluation.py
│   │   │       ├── optim.py
│   │   │       └── seq2seq_amr_parser.py
│   │   ├── classifiers/
│   │   │   ├── __init__.py
│   │   │   ├── fasttext_classifier.py
│   │   │   ├── transformer_classifier.py
│   │   │   ├── transformer_classifier_hf.py
│   │   │   ├── transformer_classifier_tf.py
│   │   │   └── transformer_regression_hf.py
│   │   ├── distillation/
│   │   │   ├── __init__.py
│   │   │   ├── distillable_component.py
│   │   │   ├── losses.py
│   │   │   └── schedulers.py
│   │   ├── eos/
│   │   │   ├── __init__.py
│   │   │   └── ngram.py
│   │   ├── lambda_wrapper.py
│   │   ├── lemmatizer.py
│   │   ├── lm/
│   │   │   ├── __init__.py
│   │   │   └── mlm.py
│   │   ├── mtl/
│   │   │   ├── __init__.py
│   │   │   ├── multi_task_learning.py
│   │   │   └── tasks/
│   │   │       ├── __init__.py
│   │   │       ├── amr.py
│   │   │       ├── constituency.py
│   │   │       ├── dep.py
│   │   │       ├── dep_2nd.py
│   │   │       ├── lem.py
│   │   │       ├── ner/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── biaffine_ner.py
│   │   │       │   └── tag_ner.py
│   │   │       ├── pos.py
│   │   │       ├── sdp.py
│   │   │       ├── srl/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── bio_srl.py
│   │   │       │   └── rank_srl.py
│   │   │       ├── tok/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── reg_tok.py
│   │   │       │   └── tag_tok.py
│   │   │       └── ud.py
│   │   ├── ner/
│   │   │   ├── __init__.py
│   │   │   ├── biaffine_ner/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── biaffine_ner.py
│   │   │   │   └── biaffine_ner_model.py
│   │   │   ├── ner_tf.py
│   │   │   ├── rnn_ner.py
│   │   │   └── transformer_ner.py
│   │   ├── parsers/
│   │   │   ├── __init__.py
│   │   │   ├── alg.py
│   │   │   ├── alg_tf.py
│   │   │   ├── biaffine/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── biaffine.py
│   │   │   │   ├── biaffine_2nd_dep.py
│   │   │   │   ├── biaffine_dep.py
│   │   │   │   ├── biaffine_model.py
│   │   │   │   ├── biaffine_sdp.py
│   │   │   │   ├── mlp.py
│   │   │   │   ├── structual_attention.py
│   │   │   │   └── variationalbilstm.py
│   │   │   ├── biaffine_parser_tf.py
│   │   │   ├── biaffine_tf/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── alg.py
│   │   │   │   ├── layers.py
│   │   │   │   └── model.py
│   │   │   ├── chu_liu_edmonds.py
│   │   │   ├── conll.py
│   │   │   ├── constituency/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── crf_constituency_model.py
│   │   │   │   ├── crf_constituency_parser.py
│   │   │   │   └── treecrf.py
│   │   │   ├── parse_alg.py
│   │   │   └── ud/
│   │   │       ├── __init__.py
│   │   │       ├── lemma_edit.py
│   │   │       ├── tag_decoder.py
│   │   │       ├── ud_model.py
│   │   │       ├── ud_parser.py
│   │   │       ├── udify_util.py
│   │   │       └── util.py
│   │   ├── pipeline.py
│   │   ├── rnn_language_model_tf.py
│   │   ├── srl/
│   │   │   ├── __init__.py
│   │   │   ├── span_bio/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── baffine_tagging.py
│   │   │   │   └── span_bio.py
│   │   │   └── span_rank/
│   │   │       ├── __init__.py
│   │   │       ├── highway_variational_lstm.py
│   │   │       ├── inference_utils.py
│   │   │       ├── layer.py
│   │   │       ├── span_rank.py
│   │   │       ├── span_ranking_srl_model.py
│   │   │       ├── srl_eval_utils.py
│   │   │       └── util.py
│   │   ├── sts/
│   │   │   ├── __init__.py
│   │   │   └── transformer_sts.py
│   │   ├── taggers/
│   │   │   ├── __init__.py
│   │   │   ├── cnn_tagger_tf.py
│   │   │   ├── ngram_conv/
│   │   │   │   ├── __init__.py
│   │   │   │   └── ngram_conv_tagger.py
│   │   │   ├── pos_tf.py
│   │   │   ├── rnn/
│   │   │   │   ├── __init__.py
│   │   │   │   └── rnntaggingmodel.py
│   │   │   ├── rnn_tagger.py
│   │   │   ├── rnn_tagger_tf.py
│   │   │   ├── tagger.py
│   │   │   ├── tagger_tf.py
│   │   │   ├── transformers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── metrics_tf.py
│   │   │   │   ├── transformer_tagger.py
│   │   │   │   ├── transformer_tagger_tf.py
│   │   │   │   └── transformer_transform_tf.py
│   │   │   └── util.py
│   │   └── tokenizers/
│   │       ├── __init__.py
│   │       ├── multi_criteria_cws_transformer.py
│   │       ├── tok.py
│   │       ├── tok_tf.py
│   │       └── transformer.py
│   ├── datasets/
│   │   ├── __init__.py
│   │   ├── classification/
│   │   │   ├── __init__.py
│   │   │   └── sentiment.py
│   │   ├── coref/
│   │   │   ├── __init__.py
│   │   │   └── loaders/
│   │   │       ├── __init__.py
│   │   │       └── conll12coref.py
│   │   ├── eos/
│   │   │   ├── __init__.py
│   │   │   ├── eos.py
│   │   │   └── loaders/
│   │   │       ├── __init__.py
│   │   │       └── nn_eos.py
│   │   ├── lm/
│   │   │   ├── __init__.py
│   │   │   └── loaders/
│   │   │       ├── __init__.py
│   │   │       └── lm_dataset.py
│   │   ├── lu/
│   │   │   ├── __init__.py
│   │   │   └── glue.py
│   │   ├── ner/
│   │   │   ├── __init__.py
│   │   │   ├── conll03.py
│   │   │   ├── loaders/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── json_ner.py
│   │   │   │   └── tsv.py
│   │   │   ├── msra.py
│   │   │   ├── resume.py
│   │   │   └── weibo.py
│   │   ├── parsing/
│   │   │   ├── __init__.py
│   │   │   ├── amr.py
│   │   │   ├── ctb5.py
│   │   │   ├── ctb7.py
│   │   │   ├── ctb8.py
│   │   │   ├── ctb9.py
│   │   │   ├── loaders/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _ctb_utils.py
│   │   │   │   ├── conll_dataset.py
│   │   │   │   └── constituency_dataset.py
│   │   │   ├── pmt1.py
│   │   │   ├── ptb.py
│   │   │   ├── semeval15.py
│   │   │   ├── semeval16.py
│   │   │   └── ud/
│   │   │       ├── __init__.py
│   │   │       ├── ud210.py
│   │   │       ├── ud210m.py
│   │   │       ├── ud23.py
│   │   │       ├── ud23m.py
│   │   │       ├── ud27.py
│   │   │       └── ud27m.py
│   │   ├── pos/
│   │   │   ├── __init__.py
│   │   │   └── ctb5.py
│   │   ├── qa/
│   │   │   ├── __init__.py
│   │   │   └── hotpotqa.py
│   │   ├── srl/
│   │   │   ├── __init__.py
│   │   │   ├── loaders/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── conll2012.py
│   │   │   │   └── ontonotes_loader.py
│   │   │   └── ontonotes5/
│   │   │       ├── __init__.py
│   │   │       ├── _utils.py
│   │   │       ├── chinese.py
│   │   │       └── english.py
│   │   ├── sts/
│   │   │   ├── __init__.py
│   │   │   └── stsb.py
│   │   └── tokenization/
│   │       ├── __init__.py
│   │       ├── ctb6.py
│   │       ├── loaders/
│   │       │   ├── __init__.py
│   │       │   ├── chunking_dataset.py
│   │       │   ├── multi_criteria_cws/
│   │       │   │   ├── __init__.py
│   │       │   │   └── mcws_dataset.py
│   │       │   └── txt.py
│   │       └── sighan2005/
│   │           ├── __init__.py
│   │           ├── as_.py
│   │           ├── cityu.py
│   │           ├── msr.py
│   │           └── pku.py
│   ├── layers/
│   │   ├── __init__.py
│   │   ├── cnn_encoder.py
│   │   ├── crf/
│   │   │   ├── __init__.py
│   │   │   ├── crf.py
│   │   │   ├── crf_layer_tf.py
│   │   │   └── crf_tf.py
│   │   ├── dropout.py
│   │   ├── embeddings/
│   │   │   ├── __init__.py
│   │   │   ├── char_cnn.py
│   │   │   ├── char_cnn_tf.py
│   │   │   ├── char_rnn.py
│   │   │   ├── char_rnn_tf.py
│   │   │   ├── concat_embedding.py
│   │   │   ├── contextual_string_embedding.py
│   │   │   ├── contextual_string_embedding_tf.py
│   │   │   ├── contextual_word_embedding.py
│   │   │   ├── embedding.py
│   │   │   ├── fast_text.py
│   │   │   ├── fast_text_tf.py
│   │   │   ├── util.py
│   │   │   ├── util_tf.py
│   │   │   ├── word2vec.py
│   │   │   └── word2vec_tf.py
│   │   ├── feed_forward.py
│   │   ├── feedforward.py
│   │   ├── scalar_mix.py
│   │   ├── time_distributed.py
│   │   ├── transformers/
│   │   │   ├── __init__.py
│   │   │   ├── encoder.py
│   │   │   ├── loader_tf.py
│   │   │   ├── pt_imports.py
│   │   │   ├── relative_transformer.py
│   │   │   ├── resource.py
│   │   │   ├── tf_imports.py
│   │   │   ├── utils.py
│   │   │   └── utils_tf.py
│   │   └── weight_normalization.py
│   ├── losses/
│   │   ├── __init__.py
│   │   └── sparse_categorical_crossentropy.py
│   ├── metrics/
│   │   ├── __init__.py
│   │   ├── accuracy.py
│   │   ├── amr/
│   │   │   ├── __init__.py
│   │   │   └── smatch_eval.py
│   │   ├── chunking/
│   │   │   ├── __init__.py
│   │   │   ├── binary_chunking_f1.py
│   │   │   ├── bmes_tf.py
│   │   │   ├── chunking_f1.py
│   │   │   ├── chunking_f1_tf.py
│   │   │   ├── conlleval.py
│   │   │   ├── iobes_tf.py
│   │   │   └── sequence_labeling.py
│   │   ├── f1.py
│   │   ├── metric.py
│   │   ├── mtl.py
│   │   ├── parsing/
│   │   │   ├── __init__.py
│   │   │   ├── attachmentscore.py
│   │   │   ├── conllx_eval.py
│   │   │   ├── labeled_f1.py
│   │   │   ├── labeled_f1_tf.py
│   │   │   ├── labeled_score.py
│   │   │   ├── semdep_eval.py
│   │   │   └── span.py
│   │   ├── spearman_correlation.py
│   │   └── srl/
│   │       ├── __init__.py
│   │       └── srlconll.py
│   ├── optimizers/
│   │   ├── __init__.py
│   │   └── adamw/
│   │       ├── __init__.py
│   │       └── optimization.py
│   ├── pretrained/
│   │   ├── __init__.py
│   │   ├── amr.py
│   │   ├── amr2text.py
│   │   ├── classifiers.py
│   │   ├── constituency.py
│   │   ├── dep.py
│   │   ├── eos.py
│   │   ├── fasttext.py
│   │   ├── glove.py
│   │   ├── mtl.py
│   │   ├── ner.py
│   │   ├── pos.py
│   │   ├── rnnlm.py
│   │   ├── sdp.py
│   │   ├── srl.py
│   │   ├── sts.py
│   │   ├── tok.py
│   │   └── word2vec.py
│   ├── transform/
│   │   ├── __init__.py
│   │   ├── conll_tf.py
│   │   ├── glue_tf.py
│   │   ├── table_tf.py
│   │   ├── tacred_tf.py
│   │   ├── text_tf.py
│   │   ├── transformer_tokenizer.py
│   │   ├── tsv_tf.py
│   │   └── txt_tf.py
│   ├── utils/
│   │   ├── __init__.py
│   │   ├── component_util.py
│   │   ├── file_read_backwards/
│   │   │   ├── __init__.py
│   │   │   ├── buffer_work_space.py
│   │   │   └── file_read_backwards.py
│   │   ├── init_util.py
│   │   ├── io_util.py
│   │   ├── lang/
│   │   │   ├── __init__.py
│   │   │   ├── en/
│   │   │   │   ├── __init__.py
│   │   │   │   └── english_tokenizer.py
│   │   │   ├── ja/
│   │   │   │   ├── __init__.py
│   │   │   │   └── bert_tok.py
│   │   │   └── zh/
│   │   │       ├── __init__.py
│   │   │       ├── char_table.py
│   │   │       └── localization.py
│   │   ├── log_util.py
│   │   ├── rules.py
│   │   ├── span_util.py
│   │   ├── string_util.py
│   │   ├── tf_util.py
│   │   ├── time_util.py
│   │   └── torch_util.py
│   └── version.py
├── plugins/
│   ├── README.md
│   ├── hanlp_common/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── hanlp_common/
│   │   │   ├── __init__.py
│   │   │   ├── amr.py
│   │   │   ├── configurable.py
│   │   │   ├── conll.py
│   │   │   ├── constant.py
│   │   │   ├── document.py
│   │   │   ├── io.py
│   │   │   ├── reflection.py
│   │   │   ├── structure.py
│   │   │   ├── util.py
│   │   │   └── visualization.py
│   │   └── setup.py
│   ├── hanlp_demo/
│   │   ├── README.md
│   │   ├── hanlp_demo/
│   │   │   ├── __init__.py
│   │   │   ├── block_windows.py
│   │   │   ├── en/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── demo_amr.py
│   │   │   │   ├── demo_dep.py
│   │   │   │   ├── demo_lm.py
│   │   │   │   ├── demo_ner.py
│   │   │   │   ├── demo_pipeline.py
│   │   │   │   ├── demo_pos.py
│   │   │   │   ├── demo_sdp.py
│   │   │   │   ├── demo_sentiment_analysis.py
│   │   │   │   ├── demo_tok.py
│   │   │   │   └── train_sst2_albert_base.py
│   │   │   ├── ja/
│   │   │   │   ├── __init__.py
│   │   │   │   └── demo_mtl.py
│   │   │   ├── mul/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── demo_lid.py
│   │   │   │   ├── demo_lid_restful.py
│   │   │   │   ├── demo_mtl.py
│   │   │   │   └── train/
│   │   │   │       ├── __init__.py
│   │   │   │       └── mul_base.py
│   │   │   ├── sent_split.py
│   │   │   └── zh/
│   │   │       ├── __init__.py
│   │   │       ├── abstractive_summarization_restful.ipynb
│   │   │       ├── amr_restful.ipynb
│   │   │       ├── amr_stl.ipynb
│   │   │       ├── classification_restful.ipynb
│   │   │       ├── con_mtl.ipynb
│   │   │       ├── con_restful.ipynb
│   │   │       ├── con_stl.ipynb
│   │   │       ├── cor_restful.ipynb
│   │   │       ├── demo_amr.py
│   │   │       ├── demo_custom_dict.py
│   │   │       ├── demo_custom_dict_stl.py
│   │   │       ├── demo_del_tasks.py
│   │   │       ├── demo_document.py
│   │   │       ├── demo_mlm.py
│   │   │       ├── demo_mtl.py
│   │   │       ├── demo_ner_dict.py
│   │   │       ├── demo_parse_constituency.py
│   │   │       ├── demo_pipeline.py
│   │   │       ├── demo_pos_dict.py
│   │   │       ├── demo_sts.py
│   │   │       ├── demo_word2vec.py
│   │   │       ├── dep_mtl.ipynb
│   │   │       ├── dep_restful.ipynb
│   │   │       ├── dep_stl.ipynb
│   │   │       ├── extractive_summarization_restful.ipynb
│   │   │       ├── gec_restful.ipynb
│   │   │       ├── keyphrase_restful.ipynb
│   │   │       ├── lid_restful.ipynb
│   │   │       ├── lid_stl.ipynb
│   │   │       ├── ner_mtl.ipynb
│   │   │       ├── ner_restful.ipynb
│   │   │       ├── ner_stl.ipynb
│   │   │       ├── pos_mtl.ipynb
│   │   │       ├── pos_restful.ipynb
│   │   │       ├── pos_stl.ipynb
│   │   │       ├── sdp_mtl.ipynb
│   │   │       ├── sdp_restful.ipynb
│   │   │       ├── sdp_stl.ipynb
│   │   │       ├── sentiment_restful.ipynb
│   │   │       ├── srl_mtl.ipynb
│   │   │       ├── srl_restful.ipynb
│   │   │       ├── srl_stl.ipynb
│   │   │       ├── sts_restful.ipynb
│   │   │       ├── sts_stl.ipynb
│   │   │       ├── tf/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── demo_classifier.py
│   │   │       │   ├── demo_client.py
│   │   │       │   ├── demo_cws.py
│   │   │       │   ├── demo_cws_trie.py
│   │   │       │   ├── demo_dep.py
│   │   │       │   ├── demo_fasttext.py
│   │   │       │   ├── demo_multiprocess.py
│   │   │       │   ├── demo_ner.py
│   │   │       │   ├── demo_pipeline.py
│   │   │       │   ├── demo_pos.py
│   │   │       │   ├── demo_sdp.py
│   │   │       │   ├── demo_serving.py
│   │   │       │   └── train/
│   │   │       │       ├── __init__.py
│   │   │       │       ├── cws/
│   │   │       │       │   ├── __init__.py
│   │   │       │       │   ├── train_ctb6_cws_albert.py
│   │   │       │       │   ├── train_ctb6_cws_bert.py
│   │   │       │       │   ├── train_ctb6_cws_convseg.py
│   │   │       │       │   ├── train_large_bert_cws.py
│   │   │       │       │   ├── train_large_conv_cws.py
│   │   │       │       │   ├── train_large_cws_albert.py
│   │   │       │       │   ├── train_large_cws_electra.py
│   │   │       │       │   ├── train_large_rnn_cws.py
│   │   │       │       │   ├── train_msr_cws_albert.py
│   │   │       │       │   ├── train_msr_cws_bert.py
│   │   │       │       │   ├── train_msr_cws_ngram_conv.py
│   │   │       │       │   ├── train_msr_cws_ngram_conv_embed.py
│   │   │       │       │   ├── train_pku980106_conv_cws.py
│   │   │       │       │   ├── train_pku980106_rnn_cws.py
│   │   │       │       │   └── train_pku_conv_cws.py
│   │   │       │       ├── finetune_msra_ner_albert.py
│   │   │       │       ├── train_chnsenticorp_bert.py
│   │   │       │       ├── train_conll03_ner_bert.py
│   │   │       │       ├── train_conll03_ner_flair.py
│   │   │       │       ├── train_ctb5_dep.py
│   │   │       │       ├── train_ctb5_pos_rnn.py
│   │   │       │       ├── train_ctb7_dep.py
│   │   │       │       ├── train_ctb9_pos_albert.py
│   │   │       │       ├── train_ctb9_pos_electra.py
│   │   │       │       ├── train_msra_ner_albert.py
│   │   │       │       ├── train_msra_ner_bert.py
│   │   │       │       ├── train_msra_ner_electra.py
│   │   │       │       ├── train_msra_ner_ngram_conv.py
│   │   │       │       ├── train_msra_ner_rnn.py
│   │   │       │       ├── train_ptb_dep_biaffine_albert.py
│   │   │       │       ├── train_ptb_dep_biaffine_bert.py
│   │   │       │       ├── train_ptb_dep_biaffine_bert_96.6.py
│   │   │       │       ├── train_ptb_dep_biaffine_bert_positional.py
│   │   │       │       ├── train_ptb_dep_sa_albert.py
│   │   │       │       ├── train_ptb_dep_sa_albert_topk.py
│   │   │       │       ├── train_ptb_dep_sa_bert.py
│   │   │       │       ├── train_ptb_dep_sa_pos_bert.py
│   │   │       │       ├── train_ptb_pos_rnn_fasttext.py
│   │   │       │       ├── train_semeval15_dm.py
│   │   │       │       ├── train_semeval15_pas.py
│   │   │       │       ├── train_semeval15_psd.py
│   │   │       │       ├── train_semeval16_news.py
│   │   │       │       └── train_semeval16_text.py
│   │   │       ├── tok_mtl.ipynb
│   │   │       ├── tok_restful.ipynb
│   │   │       ├── tok_stl.ipynb
│   │   │       ├── train/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── finetune_ner.py
│   │   │       │   ├── open_base.py
│   │   │       │   └── open_small.py
│   │   │       ├── train_sota_bert_pku.py
│   │   │       ├── tst_restful.ipynb
│   │   │       └── tutorial.ipynb
│   │   └── setup.py
│   ├── hanlp_restful/
│   │   ├── README.md
│   │   ├── hanlp_restful/
│   │   │   └── __init__.py
│   │   ├── setup.py
│   │   └── tests/
│   │       ├── __init__.py
│   │       └── test_client.py
│   ├── hanlp_restful_golang/
│   │   └── README.md
│   ├── hanlp_restful_java/
│   │   ├── pom.xml
│   │   └── src/
│   │       ├── main/
│   │       │   └── java/
│   │       │       └── com/
│   │       │           └── hankcs/
│   │       │               └── hanlp/
│   │       │                   └── restful/
│   │       │                       ├── BaseInput.java
│   │       │                       ├── CoreferenceResolutionOutput.java
│   │       │                       ├── DocumentInput.java
│   │       │                       ├── HanLPClient.java
│   │       │                       ├── SentenceInput.java
│   │       │                       ├── Span.java
│   │       │                       ├── TokenInput.java
│   │       │                       └── mrp/
│   │       │                           ├── Anchor.java
│   │       │                           ├── Edge.java
│   │       │                           ├── MeaningRepresentation.java
│   │       │                           └── Node.java
│   │       └── test/
│   │           └── java/
│   │               └── com/
│   │                   └── hankcs/
│   │                       └── hanlp/
│   │                           └── restful/
│   │                               ├── HanLPClientTest.java
│   │                               └── MeaningRepresentationTest.java
│   └── hanlp_trie/
│       ├── README.md
│       ├── hanlp_trie/
│       │   ├── __init__.py
│       │   ├── dictionary.py
│       │   └── trie.py
│       ├── setup.py
│       └── tests/
│           ├── __init__.py
│           ├── test_trie.py
│           └── test_trie_dict.py
├── setup.py
└── tests/
    ├── __init__.py
    ├── test_config_tracker.py
    ├── test_mtl.py
    ├── test_pipeline.py
    ├── test_rules.py
    └── test_string_util.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: 🐛发现一个bug
about: 需提交版本号、触发代码、错误日志
title: ''
labels: bug
assignees: hankcs

---

<!--
感谢找出bug，请认真填写下表：
-->

**Describe the bug**
A clear and concise description of what the bug is.

**Code to reproduce the issue**
Provide a reproducible test case that is the bare minimum necessary to generate the problem.

```python
```

**Describe the current behavior**
A clear and concise description of what happened.

**Expected behavior**
A clear and concise description of what you expected to happen.

**System information**
- OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
- Python version:
- HanLP version:

**Other info / logs**
Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.

* [ ] I've completed this form and searched the web for solutions.
<!-- ⬆️此处务必勾选，否则你的issue会被机器人自动删除！ -->
<!-- ⬆️此处务必勾选，否则你的issue会被机器人自动删除！ -->
<!-- ⬆️此处务必勾选，否则你的issue会被机器人自动删除！ -->

================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
contact_links:
  - name: ⁉️ 提问求助请上论坛
    url: https://bbs.hankcs.com/
    about: 欢迎前往蝴蝶效应论坛求助


================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: 🚀新功能请愿
about: 建议增加一个新功能
title: ''
labels: feature request
assignees: hankcs

---

<!--
提问请上论坛，不要发这里！
提问请上论坛，不要发这里！
提问请上论坛，不要发这里！

以下必填，否则直接关闭。
-->

**Describe the feature and the current behavior/state.**

**Will this change the current api? How?**

**Who will benefit from this feature?**

**Are you willing to contribute it (Yes/No):**

**System information**
- OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
- Python version:
- HanLP version:

**Any other info**

* [ ] I've carefully completed this form.
<!-- 发表前先搜索，此处一定要勾选！ -->
<!-- 发表前先搜索，此处一定要勾选！ -->
<!-- 发表前先搜索，此处一定要勾选！ -->

================================================
FILE: .github/pull_request_template.md
================================================
<!--
Thank you for being interested in contributing to HanLP! You are awesome ✨.
⚠️Changes must be made on the dev branch.
-->

# Title of Your Pull Request

## Description

Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change.

Fixes # (issue)

## Type of Change

Please check any relevant options and delete the rest.

- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] This change requires a documentation update

## How Has This Been Tested?

Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration.

## Checklist

Check all items that apply.

- [ ] ⚠️Changes **must** be made on the `dev` branch instead of `master`
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] My code follows the style guidelines of this project
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have checked my code and corrected any misspellings


================================================
FILE: .github/workflows/unit-tests.yml
================================================
name: Unit Tests

on:
  push:
    branches: [ "**" ]
  pull_request:
    branches: [ "**" ]

jobs:
  build:

    runs-on: ${{ matrix.os }}
    env:
      HANLP_HOME: ${{ github.workspace }}/data
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest, windows-latest ]
        python-version: [ 3.6, 3.7, 3.8, 3.9, '3.10' ]
        exclude:
          # GHA doesn't list 3.6 for ubuntu-22.04
          - os: ubuntu-latest
            python-version: "3.6"

          # MacOS 14.4.1 for arm64 doesn't support Python < 3.8
          - os: macos-latest
            python-version: "3.6"
          - os: macos-latest
            python-version: "3.7"

        include:
          # GHA doesn't list 3.6 for ubuntu-22.04, so run it on ubuntu-20.04 instead
          - os: ubuntu-20.04
            python-version: "3.6"

          # MacOS 13 required for Python < 3.8
          - os: macos-13
            python-version: "3.6"
          - os: macos-13
            python-version: "3.7"

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        shell: bash
        run: |
          python -m pip install -e plugins/hanlp_trie
          python -m pip install -e plugins/hanlp_common
          python -m pip install -e .
          python -m pip install pytest

      - name: Cache data
        uses: actions/cache@v3
        with:
          path: ${{ env.HANLP_HOME }}
          key: hanlp-data

      - name: Test with pytest
        shell: bash
        run: |
          pytest tests
          pytest plugins/hanlp_trie/tests
  deploy:
    needs: build
    if: github.event_name == 'push' && github.ref == 'refs/heads/master'
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Install dependencies
        run: |
          python -m pip install setuptools wheel twine
      - name: Deploy to PyPI
        run: |
          python setup.py sdist bdist_wheel
          python -m twine upload dist/*
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
          TWINE_REPOSITORY: pypi


================================================
FILE: .gitignore
================================================
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

### Java template
# Compiled class file
*.class

# Log file

# BlueJ files
*.ctxt

# Mobile Tools for Java (J2ME)
.mtj.tmp/

# Package Files #
*.jar
*.war
*.nar
*.ear
*.zip
*.tar.gz
*.rar

# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*

### Eclipse template
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath
.recommenders

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# PyDev specific (Python IDE for Eclipse)
*.pydevproject

# CDT-specific (C/C++ Development Tooling)
.cproject

# CDT- autotools
.autotools

# Java annotation processor (APT)
.factorypath

# PDT-specific (PHP Development Tools)
.buildpath

# sbteclipse plugin
.target

# Tern plugin
.tern-project

# TeXlipse plugin
.texlipse

# STS (Spring Tool Suite)
.springBeans

# Code Recommenders
.recommenders/

# Annotation Processing
.apt_generated/

# Scala IDE specific (Scala & Java development for Eclipse)
.cache-main
.scala_dependencies
.worksheet

### VisualStudioCode template
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json

### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn.  Uncomment if using
# auto-import.
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
.idea
*.iml
data
.vscode
*.pkl
*.pdf
_static/
_build/
_templates/

================================================
FILE: CITATION.cff
================================================
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: He
  given-names: Han
  orcid: "https://orcid.org/0009-0005-1778-917X"
title: "HanLP: Han Language Processing"
version: 2.1
date-released: 2015-05-27
url: "https://github.com/hankcs/HanLP"
preferred-citation:
  type: conference-paper
  authors:
    - family-names: He
      given-names: Han
    - family-names: Choi
      given-names: Jinho D.
  title: "The Stem Cell Hypothesis: Dilemma behind Multi-Task Learning with Transformer Encoders"
  editors:
    - family-names: Moens
      given-names: Marie-Francine
    - family-names: Huang
      given-names: Xuanjing
    - family-names: Specia
      given-names: Lucia
    - family-names: Yih
      given-names: Scott Wen-tau
  year: 2021
  month: 11
  date-released: 2021-11
  conference:
    name: "2021 Conference on Empirical Methods in Natural Language Processing"
    place: "Online and Punta Cana, Dominican Republic"
    url: "https://aclanthology.org/2021.emnlp-main.451"
  doi: "10.18653/v1/2021.emnlp-main.451"
  url: "https://aclanthology.org/2021.emnlp-main.451"
  publisher: "Association for Computational Linguistics"
  booktitle: "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing"
  location: "Online and Punta Cana, Dominican Republic"
  pages: "5555-5577"


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
<h2 align="center">HanLP: Han Language Processing</h2>

<div align="center">
    <a href="https://github.com/hankcs/HanLP/actions/workflows/unit-tests.yml">
       <img alt="Unit Tests" src="https://github.com/hankcs/hanlp/actions/workflows/unit-tests.yml/badge.svg?branch=master">
    </a>
    <a href="https://pypi.org/project/hanlp/">
        <img alt="PyPI Version" src="https://img.shields.io/pypi/v/hanlp?color=blue">
    </a>
    <a href="https://pypi.org/project/hanlp/">
        <img alt="Python Versions" src="https://img.shields.io/pypi/pyversions/hanlp?colorB=blue">
    </a>
    <a href="https://pepy.tech/project/hanlp">
        <img alt="Downloads" src="https://static.pepy.tech/badge/hanlp">
    </a>
    <a href="https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Ftutorial.ipynb">
        <img alt="在线运行" src="https://mybinder.org/badge_logo.svg">
    </a>
</div>
<h4 align="center">
    <a href="https://github.com/hankcs/HanLP/tree/master">English</a> |
    <a href="https://github.com/hankcs/HanLP/tree/doc-ja">日本語</a> |
    <a href="https://hanlp.hankcs.com/docs/">文档</a> |
    <a href="https://bbs.hankcs.com/t/topic/3940">论文</a> |
    <a href="https://bbs.hankcs.com/">论坛</a> |
    <a href="https://github.com/wangedison/hanlp-jupyterlab-docker">docker</a> |
    <a href="https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Ftutorial.ipynb">▶️在线运行</a>
</h4>


面向生产环境的多语种自然语言处理工具包，基于PyTorch和TensorFlow 2.x双引擎，目标是普及落地最前沿的NLP技术。HanLP具备功能完善、精度准确、性能高效、语料时新、架构清晰、可自定义的特点。

[![demo](https://raw.githubusercontent.com/hankcs/OpenCC-to-HanLP/img/demo.gif)](https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Ftutorial.ipynb)

借助世界上最大的多语种语料库，HanLP2.1支持包括简繁中英日俄法德在内的[130种语言](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/mtl.html#hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6)上的10种联合任务以及多种单任务。HanLP预训练了十几种任务上的数十个模型并且正在持续迭代语料库与模型：

<div align="center">

| 功能                                                         | RESTful                                                      | 多任务                                                       | 单任务                                                       | 模型                                                         | 标注标准                                                     |
| ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| [分词](https://hanlp.hankcs.com/demos/tok.html)              | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_restful.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_mtl.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_stl.ipynb) | [tok](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/tok.html) | [粗分](https://hanlp.hankcs.com/docs/annotations/tok/msr.html)、[细分](https://hanlp.hankcs.com/docs/annotations/tok/ctb.html) |
| [词性标注](https://hanlp.hankcs.com/demos/pos.html)          | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/pos_restful.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/pos_mtl.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/pos_stl.ipynb) | [pos](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/pos.html) | [CTB](https://hanlp.hankcs.com/docs/annotations/pos/ctb.html)、[PKU](https://hanlp.hankcs.com/docs/annotations/pos/pku.html)、[863](https://hanlp.hankcs.com/docs/annotations/pos/863.html) |
| [命名实体识别](https://hanlp.hankcs.com/demos/ner.html)      | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/ner_restful.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/ner_mtl.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/ner_stl.ipynb) | [ner](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/ner.html) | [PKU](https://hanlp.hankcs.com/docs/annotations/ner/pku.html)、[MSRA](https://hanlp.hankcs.com/docs/annotations/ner/msra.html)、[OntoNotes](https://hanlp.hankcs.com/docs/annotations/ner/ontonotes.html) |
| [依存句法分析](https://hanlp.hankcs.com/demos/dep.html)      | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/dep_restful.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/dep_mtl.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/dep_stl.ipynb) | [dep](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/dep.html) | [SD](https://hanlp.hankcs.com/docs/annotations/dep/sd_zh.html)、[UD](https://hanlp.hankcs.com/docs/annotations/dep/ud.html#chinese)、[PMT](https://hanlp.hankcs.com/docs/annotations/dep/pmt.html) |
| [成分句法分析](https://hanlp.hankcs.com/demos/con.html)      | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/con_restful.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/con_mtl.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/con_stl.ipynb) | [con](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/constituency.html) | [Chinese Tree Bank](https://hanlp.hankcs.com/docs/annotations/constituency/ctb.html) |
| [语义依存分析](https://hanlp.hankcs.com/demos/sdp.html)      | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sdp_restful.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sdp_mtl.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sdp_stl.ipynb) | [sdp](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/sdp.html) | [CSDP](https://hanlp.hankcs.com/docs/annotations/sdp/semeval16.html#) |
| [语义角色标注](https://hanlp.hankcs.com/demos/srl.html)      | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/srl_restful.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/srl_mtl.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/srl_stl.ipynb) | [srl](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/srl.html) | [Chinese Proposition Bank](https://hanlp.hankcs.com/docs/annotations/srl/cpb.html) |
| [抽象意义表示](https://hanlp.hankcs.com/demos/amr.html)      | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/amr_restful.ipynb) | 暂无                                                         | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/amr_stl.ipynb) | [amr](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/amr.html) | [CAMR](https://www.hankcs.com/nlp/corpus/introduction-to-chinese-abstract-meaning-representation.html) |
| [指代消解](https://hanlp.hankcs.com/demos/cor.html)          | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/cor_restful.ipynb) | 暂无                                                         | 暂无                                                         | 暂无                                                         | OntoNotes                                                    |
| [语义文本相似度](https://hanlp.hankcs.com/demos/sts.html)    | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sts_restful.ipynb) | 暂无                                                         | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sts_stl.ipynb) | [sts](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/sts.html) | 暂无                                                         |
| [文本风格转换](https://hanlp.hankcs.com/demos/tst.html)      | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tst_restful.ipynb) | 暂无                                                         | 暂无                                                         | 暂无                                                         | 暂无                                                         |
| [关键词短语提取](https://hanlp.hankcs.com/demos/keyphrase.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/keyphrase_restful.ipynb) | 暂无                                                         | 暂无                                                         | 暂无                                                         | 暂无                                                         |
| [抽取式自动摘要](https://hanlp.hankcs.com/demos/exsum.html)  | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/extractive_summarization_restful.ipynb) | 暂无                                                         | 暂无                                                         | 暂无                                                         | 暂无                                                         |
| [生成式自动摘要](https://hanlp.hankcs.com/demos/absum.html)  | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/abstractive_summarization_restful.ipynb) | 暂无                                                         | 暂无                                                         | 暂无                                                         | 暂无                                                         |
| [文本语法纠错](https://hanlp.hankcs.com/demos/gec.html)      | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/gec_restful.ipynb) | 暂无                                                         | 暂无                                                         | 暂无                                                         | 暂无                                                         |
| [文本分类](https://hanlp.hankcs.com/demos/classification.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/classification_restful.ipynb) | 暂无                                                         | 暂无                                                         | 暂无                                                         | 暂无                                                         |
| [情感分析](https://hanlp.hankcs.com/demos/sentiment.html)    | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sentiment_restful.ipynb) | 暂无                                                         | 暂无                                                         | 暂无                                                         | `[-1,+1]`                                                    |
| [语种检测](https://hanlp.hankcs.com/demos/classification.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/lid_restful.ipynb) | 暂无                                                         | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/lid_stl.ipynb) | 暂无                                                         | [ISO 639-1编码](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) |

</div>

- 词干提取、词法语法特征提取请参考[英文教程](https://hanlp.hankcs.com/docs/tutorial.html)；[词向量](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/word2vec.html)和[完形填空](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/mlm.html)请参考相应文档。
- 简繁转换、拼音、新词发现、文本聚类请参考[1.x教程](https://github.com/hankcs/HanLP/tree/1.x)。

量体裁衣，HanLP提供**RESTful**和**native**两种API，分别面向轻量级和海量级两种场景。无论何种API何种语言，HanLP接口在语义上保持一致，在代码上坚持开源。如果您在研究中使用了HanLP，请引用我们的[EMNLP论文](https://aclanthology.org/2021.emnlp-main.451/)。

### 轻量级RESTful API

仅数KB，适合敏捷开发、移动APP等场景。简单易用，无需GPU配环境，秒速安装。语料更多、模型更大、精度更高，**强烈推荐**。服务器GPU算力有限，匿名用户配额较少，[建议申请**免费公益**API密钥`auth`](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。

#### Python

```shell
pip install hanlp_restful
```

创建客户端，填入服务器地址和密钥：

```python
from hanlp_restful import HanLPClient
HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种
```

#### Golang

安装 `go get -u github.com/hankcs/gohanlp@main`，创建客户端，填入服务器地址和密钥：

```go
HanLP := hanlp.HanLPClient(hanlp.WithAuth(""),hanlp.WithLanguage("zh")) // auth不填则匿名，zh中文，mul多语种
```

#### Java

在`pom.xml`中添加依赖：

```xml
<dependency>
    <groupId>com.hankcs.hanlp.restful</groupId>
    <artifactId>hanlp-restful</artifactId>
    <version>0.0.12</version>
</dependency>
```

创建客户端，填入服务器地址和密钥：

```java
HanLPClient HanLP = new HanLPClient("https://www.hanlp.com/api", null, "zh"); // auth不填则匿名，zh中文，mul多语种
```

#### 快速上手

无论何种开发语言，调用`parse`接口，传入一篇文章，得到HanLP精准的分析结果。

```java
HanLP.parse("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。")
```

更多功能包括语义相似度、风格转换、指代消解等，请参考[文档](https://hanlp.hankcs.com/docs/api/restful.html)和[测试用例](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_restful/tests/test_client.py)。

### 海量级native API

依赖PyTorch、TensorFlow等深度学习技术，适合**专业**NLP工程师、研究者以及本地海量数据场景。要求Python 3.6至3.10，支持Windows，推荐*nix。可以在CPU上运行，推荐GPU/TPU。安装PyTorch版：

```bash
pip install hanlp
```

- HanLP每次发布都通过了Linux、macOS和Windows上Python3.6至3.10的[单元测试](https://github.com/hankcs/HanLP/actions?query=branch%3Amaster)，不存在安装问题。

HanLP发布的模型分为多任务和单任务两种，多任务速度快省显存，单任务精度高更灵活。

#### 多任务模型

HanLP的工作流程为加载模型然后将其当作函数调用，例如下列联合多任务模型：

```python
import hanlp
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH) # 世界最大中文语料库
HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'])
```

Native API的输入单位为句子，需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful和native两种API的语义设计完全一致，用户可以无缝互换。简洁的接口也支持灵活的参数，常用的技巧有：

- 灵活的`tasks`任务调度，任务越少，速度越快，详见[教程](https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Ftutorial.ipynb)。在内存有限的场景下，用户还可以[删除不需要的任务](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/demo_del_tasks.py)达到模型瘦身的效果。
- 高效的trie树自定义词典，以及强制、合并、校正3种规则，请参考[demo](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_mtl.ipynb)和[文档](https://hanlp.hankcs.com/docs/api/hanlp/components/tokenizers/transformer.html)。规则系统的效果将无缝应用到后续统计模型，从而快速适应新领域。

#### 单任务模型

根据我们的[最新研究](https://aclanthology.org/2021.emnlp-main.451)，多任务学习的优势在于速度和显存，然而精度往往不如单任务模型。所以，HanLP预训练了许多单任务模型并设计了优雅的[流水线模式](https://hanlp.hankcs.com/docs/api/hanlp/components/pipeline.html#hanlp.components.pipeline.Pipeline)将其组装起来。

```python
import hanlp
HanLP = hanlp.pipeline() \
    .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
    .append(hanlp.load('FINE_ELECTRA_SMALL_ZH'), output_key='tok') \
    .append(hanlp.load('CTB9_POS_ELECTRA_SMALL'), output_key='pos') \
    .append(hanlp.load('MSRA_NER_ELECTRA_SMALL_ZH'), output_key='ner', input_key='tok') \
    .append(hanlp.load('CTB9_DEP_ELECTRA_SMALL', conll=0), output_key='dep', input_key='tok')\
    .append(hanlp.load('CTB9_CON_ELECTRA_SMALL'), output_key='con', input_key='tok')
HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。')
```

更多功能，请参考[demo](https://github.com/hankcs/HanLP/tree/doc-zh/plugins/hanlp_demo/hanlp_demo/zh)和[文档](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/index.html)了解更多模型与用法。

### 输出格式

无论何种API何种开发语言何种自然语言，HanLP的输出统一为`json`格式兼容`dict`的[`Document`](https://hanlp.hankcs.com/docs/api/common/document.html):

```json
{
  "tok/fine": [
    ["2021年", "HanLPv2.1", "为", "生产", "环境", "带来", "次", "世代", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"],
    ["阿婆主", "来到", "北京", "立方庭", "参观", "自然", "语义", "科技", "公司", "。"]
  ],
  "tok/coarse": [
    ["2021年", "HanLPv2.1", "为", "生产", "环境", "带来", "次世代", "最", "先进", "的", "多语种", "NLP", "技术", "。"],
    ["阿婆主", "来到", "北京立方庭", "参观", "自然语义科技公司", "。"]
  ],
  "pos/ctb": [
    ["NT", "NR", "P", "NN", "NN", "VV", "JJ", "NN", "AD", "JJ", "DEG", "CD", "NN", "NR", "NN", "PU"],
    ["NN", "VV", "NR", "NR", "VV", "NN", "NN", "NN", "NN", "PU"]
  ],
  "pos/pku": [
    ["t", "nx", "p", "vn", "n", "v", "b", "n", "d", "a", "u", "a", "n", "nx", "n", "w"],
    ["n", "v", "ns", "ns", "v", "n", "n", "n", "n", "w"]
  ],
  "pos/863": [
    ["nt", "w", "p", "v", "n", "v", "a", "nt", "d", "a", "u", "a", "n", "ws", "n", "w"],
    ["n", "v", "ns", "n", "v", "n", "n", "n", "n", "w"]
  ],
  "ner/pku": [
    [],
    [["北京立方庭", "ns", 2, 4], ["自然语义科技公司", "nt", 5, 9]]
  ],
  "ner/msra": [
    [["2021年", "DATE", 0, 1], ["HanLPv2.1", "ORGANIZATION", 1, 2]],
    [["北京", "LOCATION", 2, 3], ["立方庭", "LOCATION", 3, 4], ["自然语义科技公司", "ORGANIZATION", 5, 9]]
  ],
  "ner/ontonotes": [
    [["2021年", "DATE", 0, 1], ["HanLPv2.1", "ORG", 1, 2]],
    [["北京立方庭", "FAC", 2, 4], ["自然语义科技公司", "ORG", 5, 9]]
  ],
  "srl": [
    [[["2021年", "ARGM-TMP", 0, 1], ["HanLPv2.1", "ARG0", 1, 2], ["为生产环境", "ARG2", 2, 5], ["带来", "PRED", 5, 6], ["次世代最先进的多语种NLP技术", "ARG1", 6, 15]], [["最", "ARGM-ADV", 8, 9], ["先进", "PRED", 9, 10], ["技术", "ARG0", 14, 15]]],
    [[["阿婆主", "ARG0", 0, 1], ["来到", "PRED", 1, 2], ["北京立方庭", "ARG1", 2, 4]], [["阿婆主", "ARG0", 0, 1], ["参观", "PRED", 4, 5], ["自然语义科技公司", "ARG1", 5, 9]]]
  ],
  "dep": [
    [[6, "tmod"], [6, "nsubj"], [6, "prep"], [5, "nn"], [3, "pobj"], [0, "root"], [8, "amod"], [15, "nn"], [10, "advmod"], [15, "rcmod"], [10, "assm"], [13, "nummod"], [15, "nn"], [15, "nn"], [6, "dobj"], [6, "punct"]],
    [[2, "nsubj"], [0, "root"], [4, "nn"], [2, "dobj"], [2, "conj"], [9, "nn"], [9, "nn"], [9, "nn"], [5, "dobj"], [2, "punct"]]
  ],
  "sdp": [
    [[[6, "Time"]], [[6, "Exp"]], [[5, "mPrep"]], [[5, "Desc"]], [[6, "Datv"]], [[13, "dDesc"]], [[0, "Root"], [8, "Desc"], [13, "Desc"]], [[15, "Time"]], [[10, "mDegr"]], [[15, "Desc"]], [[10, "mAux"]], [[8, "Quan"], [13, "Quan"]], [[15, "Desc"]], [[15, "Nmod"]], [[6, "Pat"]], [[6, "mPunc"]]],
    [[[2, "Agt"], [5, "Agt"]], [[0, "Root"]], [[4, "Loc"]], [[2, "Lfin"]], [[2, "ePurp"]], [[8, "Nmod"]], [[9, "Nmod"]], [[9, "Nmod"]], [[5, "Datv"]], [[5, "mPunc"]]]
  ],
  "con": [
    ["TOP", [["IP", [["NP", [["NT", ["2021年"]]]], ["NP", [["NR", ["HanLPv2.1"]]]], ["VP", [["PP", [["P", ["为"]], ["NP", [["NN", ["生产"]], ["NN", ["环境"]]]]]], ["VP", [["VV", ["带来"]], ["NP", [["ADJP", [["NP", [["ADJP", [["JJ", ["次"]]]], ["NP", [["NN", ["世代"]]]]]], ["ADVP", [["AD", ["最"]]]], ["VP", [["JJ", ["先进"]]]]]], ["DEG", ["的"]], ["NP", [["QP", [["CD", ["多"]]]], ["NP", [["NN", ["语种"]]]]]], ["NP", [["NR", ["NLP"]], ["NN", ["技术"]]]]]]]]]], ["PU", ["。"]]]]]],
    ["TOP", [["IP", [["NP", [["NN", ["阿婆主"]]]], ["VP", [["VP", [["VV", ["来到"]], ["NP", [["NR", ["北京"]], ["NR", ["立方庭"]]]]]], ["VP", [["VV", ["参观"]], ["NP", [["NN", ["自然"]], ["NN", ["语义"]], ["NN", ["科技"]], ["NN", ["公司"]]]]]]]], ["PU", ["。"]]]]]]
  ]
}
```

特别地，Python RESTful和native API支持基于等宽字体的[可视化](https://hanlp.hankcs.com/docs/tutorial.html#visualization)，能够直接将语言学结构在控制台内可视化出来：

```python
HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。']).pretty_print()

Dep Tree    	Token    	Relati	PoS	Tok      	NER Type        	Tok      	SRL PA1     	Tok      	SRL PA2     	Tok      	PoS    3       4       5       6       7       8       9 
────────────	─────────	──────	───	─────────	────────────────	─────────	────────────	─────────	────────────	─────────	─────────────────────────────────────────────────────────
 ┌─────────►	2021年    	tmod  	NT 	2021年    	───►DATE        	2021年    	───►ARGM-TMP	2021年    	            	2021年    	NT ───────────────────────────────────────────►NP ───┐   
 │┌────────►	HanLPv2.1	nsubj 	NR 	HanLPv2.1	───►ORGANIZATION	HanLPv2.1	───►ARG0    	HanLPv2.1	            	HanLPv2.1	NR ───────────────────────────────────────────►NP────┤   
 ││┌─►┌─────	为        	prep  	P  	为        	                	为        	◄─┐         	为        	            	为        	P ───────────┐                                       │   
 │││  │  ┌─►	生产       	nn    	NN 	生产       	                	生产       	  ├►ARG2    	生产       	            	生产       	NN ──┐       ├────────────────────────►PP ───┐       │   
 │││  └─►└──	环境       	pobj  	NN 	环境       	                	环境       	◄─┘         	环境       	            	环境       	NN ──┴►NP ───┘                               │       │   
┌┼┴┴────────	带来       	root  	VV 	带来       	                	带来       	╟──►PRED    	带来       	            	带来       	VV ──────────────────────────────────┐       │       │   
││       ┌─►	次        	amod  	JJ 	次        	                	次        	◄─┐         	次        	            	次        	JJ ───►ADJP──┐                       │       ├►VP────┤   
││  ┌───►└──	世代       	nn    	NN 	世代       	                	世代       	  │         	世代       	            	世代       	NN ───►NP ───┴►NP ───┐               │       │       │   
││  │    ┌─►	最        	advmod	AD 	最        	                	最        	  │         	最        	───►ARGM-ADV	最        	AD ───────────►ADVP──┼►ADJP──┐       ├►VP ───┘       ├►IP
││  │┌──►├──	先进       	rcmod 	JJ 	先进       	                	先进       	  │         	先进       	╟──►PRED    	先进       	JJ ───────────►VP ───┘       │       │               │   
││  ││   └─►	的        	assm  	DEG	的        	                	的        	  ├►ARG1    	的        	            	的        	DEG──────────────────────────┤       │               │   
││  ││   ┌─►	多        	nummod	CD 	多        	                	多        	  │         	多        	            	多        	CD ───►QP ───┐               ├►NP ───┘               │   
││  ││┌─►└──	语种       	nn    	NN 	语种       	                	语种       	  │         	语种       	            	语种       	NN ───►NP ───┴────────►NP────┤                       │   
││  │││  ┌─►	NLP      	nn    	NR 	NLP      	                	NLP      	  │         	NLP      	            	NLP      	NR ──┐                       │                       │   
│└─►└┴┴──┴──	技术       	dobj  	NN 	技术       	                	技术       	◄─┘         	技术       	───►ARG0    	技术       	NN ──┴────────────────►NP ───┘                       │   
└──────────►	。        	punct 	PU 	。        	                	。        	            	。        	            	。        	PU ──────────────────────────────────────────────────┘   

Dep Tree    	Tok	Relat	Po	Tok	NER Type        	Tok	SRL PA1 	Tok	SRL PA2 	Tok	Po    3       4       5       6 
────────────	───	─────	──	───	────────────────	───	────────	───	────────	───	────────────────────────────────
         ┌─►	阿婆主	nsubj	NN	阿婆主	                	阿婆主	───►ARG0	阿婆主	───►ARG0	阿婆主	NN───────────────────►NP ───┐   
┌┬────┬──┴──	来到 	root 	VV	来到 	                	来到 	╟──►PRED	来到 	        	来到 	VV──────────┐               │   
││    │  ┌─►	北京 	nn   	NR	北京 	───►LOCATION    	北京 	◄─┐     	北京 	        	北京 	NR──┐       ├►VP ───┐       │   
││    └─►└──	立方庭	dobj 	NR	立方庭	───►LOCATION    	立方庭	◄─┴►ARG1	立方庭	        	立方庭	NR──┴►NP ───┘       │       │   
│└─►┌───────	参观 	conj 	VV	参观 	                	参观 	        	参观 	╟──►PRED	参观 	VV──────────┐       ├►VP────┤   
│   │  ┌───►	自然 	nn   	NN	自然 	◄─┐             	自然 	        	自然 	◄─┐     	自然 	NN──┐       │       │       ├►IP
│   │  │┌──►	语义 	nn   	NN	语义 	  │             	语义 	        	语义 	  │     	语义 	NN  │       ├►VP ───┘       │   
│   │  ││┌─►	科技 	nn   	NN	科技 	  ├►ORGANIZATION	科技 	        	科技 	  ├►ARG1	科技 	NN  ├►NP ───┘               │   
│   └─►└┴┴──	公司 	dobj 	NN	公司 	◄─┘             	公司 	        	公司 	◄─┘     	公司 	NN──┘                       │   
└──────────►	。  	punct	PU	。  	                	。  	        	。  	        	。  	PU──────────────────────────┘   
```

关于标注集含义，请参考[《语言学标注规范》](https://hanlp.hankcs.com/docs/annotations/index.html)及[《格式规范》](https://hanlp.hankcs.com/docs/data_format.html)。我们购买、标注或采用了世界上量级最大、种类最多的语料库用于联合多语种多任务学习，所以HanLP的标注集也是覆盖面最广的。

## 训练你自己的领域模型

写深度学习模型一点都不难，难的是复现较高的准确率。下列[代码](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/train_sota_bert_pku.py)展示了如何在sighan2005 PKU语料库上花6分钟训练一个超越学术界state-of-the-art的中文分词模型。

```python
tokenizer = TransformerTaggingTokenizer()
save_dir = 'data/model/cws/sighan2005_pku_bert_base_96.73'
tokenizer.fit(
    SIGHAN2005_PKU_TRAIN_ALL,
    SIGHAN2005_PKU_TEST,  # Conventionally, no devset is used. See Tian et al. (2020).
    save_dir,
    'bert-base-chinese',
    max_seq_len=300,
    char_level=True,
    hard_constraint=True,
    sampler_builder=SortingSamplerBuilder(batch_size=32),
    epochs=3,
    adam_epsilon=1e-6,
    warmup_steps=0.1,
    weight_decay=0.01,
    word_dropout=0.1,
    seed=1660853059,
)
tokenizer.evaluate(SIGHAN2005_PKU_TEST, save_dir)
```

其中，由于指定了随机数种子，结果一定是`96.73`。不同于那些虚假宣传的学术论文或商业项目，HanLP保证所有结果可复现。如果你有任何质疑，我们将当作最高优先级的致命性bug第一时间排查问题。

请参考[demo](https://github.com/hankcs/HanLP/tree/master/plugins/hanlp_demo/hanlp_demo/zh/train)了解更多训练脚本。

## 性能

<table><thead><tr><th rowspan="2">lang</th><th rowspan="2">corpora</th><th rowspan="2">model</th><th colspan="2">tok</th><th colspan="4">pos</th><th colspan="3">ner</th><th rowspan="2">dep</th><th rowspan="2">con</th><th rowspan="2">srl</th><th colspan="4">sdp</th><th rowspan="2">lem</th><th rowspan="2">fea</th><th rowspan="2">amr</th></tr><tr><th>fine</th><th>coarse</th><th>ctb</th><th>pku</th><th>863</th><th>ud</th><th>pku</th><th>msra</th><th>ontonotes</th><th>SemEval16</th><th>DM</th><th>PAS</th><th>PSD</th></tr></thead><tbody><tr><td rowspan="2">mul</td><td rowspan="2">UD2.7<br>OntoNotes5</td><td>small</td><td>98.62</td><td>-</td><td>-</td><td>-</td><td>-</td><td>93.23</td><td>-</td><td>-</td><td>74.42</td><td>79.10</td><td>76.85</td><td>70.63</td><td>-</td><td>91.19</td><td>93.67</td><td>85.34</td><td>87.71</td><td>84.51</td><td>-</td></tr><tr><td>base</td><td>98.97</td><td>-</td><td>-</td><td>-</td><td>-</td><td>90.32</td><td>-</td><td>-</td><td>80.32</td><td>78.74</td><td>71.23</td><td>73.63</td><td>-</td><td>92.60</td><td>96.04</td><td>81.19</td><td>85.08</td><td>82.13</td><td>-</td></tr><tr><td rowspan="5">zh</td><td rowspan="2">open</td><td>small</td><td>97.25</td><td>-</td><td>96.66</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>95.00</td><td>84.57</td><td>87.62</td><td>73.40</td><td>84.57</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td></tr><tr><td>base</td><td>97.50</td><td>-</td><td>97.07</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>96.04</td><td>87.11</td><td>89.84</td><td>77.78</td><td>87.11</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td></tr><tr><td 
rowspan="3">close</td><td>small</td><td>96.70</td><td>95.93</td><td>96.87</td><td>97.56</td><td>95.05</td><td>-</td><td>96.22</td><td>95.74</td><td>76.79</td><td>84.44</td><td>88.13</td><td>75.81</td><td>74.28</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td></tr><tr><td>base</td><td>97.52</td><td>96.44</td><td>96.99</td><td>97.59</td><td>95.29</td><td>-</td><td>96.48</td><td>95.72</td><td>77.77</td><td>85.29</td><td>88.57</td><td>76.52</td><td>73.76</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td></tr><tr><td>ernie</td><td>96.95</td><td>97.29</td><td>96.76</td><td>97.64</td><td>95.22</td><td>-</td><td>97.31</td><td>96.47</td><td>77.95</td><td>85.67</td><td>89.17</td><td>78.51</td><td>74.10</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td></tr></tbody></table>

- 根据我们的[最新研究](https://aclanthology.org/2021.emnlp-main.451)，单任务学习的性能往往优于多任务学习。在乎精度甚于速度的话，建议使用[单任务模型](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/index.html)。

HanLP采用的数据预处理与拆分比例与流行方法未必相同，比如HanLP采用了[完整版的MSRA命名实体识别语料](https://bbs.hankcs.com/t/topic/3033)，而非大众使用的阉割版；HanLP使用了语法覆盖更广的[Stanford Dependencies标准](https://hanlp.hankcs.com/docs/annotations/dep/sd_zh.html)，而非学术界沿用的Zhang and Clark (2008)标准；HanLP提出了[均匀分割CTB的方法](https://bbs.hankcs.com/t/topic/3024)，而不采用学术界不均匀且遗漏了51个黄金文件的方法。HanLP开源了[一整套语料预处理脚本与相应语料库](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/train/open_small.py)，力图推动中文NLP的透明化。

总之，HanLP只做我们认为正确、先进的事情，而不一定是流行、权威的事情。

## 引用

如果你在研究中使用了HanLP，请按如下格式引用：

```bibtex
@inproceedings{he-choi-2021-stem,
    title = "The Stem Cell Hypothesis: Dilemma behind Multi-Task Learning with Transformer Encoders",
    author = "He, Han and Choi, Jinho D.",
    booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2021",
    address = "Online and Punta Cana, Dominican Republic",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.emnlp-main.451",
    pages = "5555--5577",
    abstract = "Multi-task learning with transformer encoders (MTL) has emerged as a powerful technique to improve performance on closely-related tasks for both accuracy and efficiency while a question still remains whether or not it would perform as well on tasks that are distinct in nature. We first present MTL results on five NLP tasks, POS, NER, DEP, CON, and SRL, and depict its deficiency over single-task learning. We then conduct an extensive pruning analysis to show that a certain set of attention heads get claimed by most tasks during MTL, who interfere with one another to fine-tune those heads for their own objectives. Based on this finding, we propose the Stem Cell Hypothesis to reveal the existence of attention heads naturally talented for many tasks that cannot be jointly trained to create adequate embeddings for all of those tasks. Finally, we design novel parameter-free probes to justify our hypothesis and demonstrate how attention heads are transformed across the five tasks during MTL through label analysis.",
}
```

## License

### 源代码

HanLP源代码的授权协议为 **Apache License 2.0**，可免费用做商业用途。请在产品说明中附加HanLP的链接和授权协议。HanLP受版权法保护，侵权必究。

##### 自然语义（青岛）科技有限公司

HanLP从v1.7版起独立运作，由自然语义（青岛）科技有限公司作为项目主体，主导后续版本的开发，并拥有后续版本的版权。

##### 上海林原公司

HanLP 早期得到了上海林原公司的大力支持，上海林原公司拥有1.28及前序版本的版权，相关版本也曾在上海林原公司网站发布。

### 预训练模型

机器学习模型的授权在法律上没有定论，但本着尊重开源语料库原始授权的精神，如不特别说明，HanLP的多语种模型授权沿用[CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/)，中文模型授权为仅供研究与教学使用。

## References

https://hanlp.hankcs.com/docs/references.html


================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# Every target, including the default `help`, is forwarded to
# `$(SPHINXBUILD) -M <target>`; this file defines no build steps of its own.

# You can set these variables from the command line, and also
# from the environment for the first two (they are assigned with `?=`,
# so a value that is already defined is kept).
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
# Source and build directories passed to Sphinx after the `-M <target>` option.
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# `Makefile` is declared phony because it is the prerequisite of the catch-all
# rule below. NOTE(review): presumably this stops make from trying to rebuild
# the Makefile itself through that rule; confirm against the GNU Make manual.
.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
# `$@` expands to the name of the requested target (e.g. `html`).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


================================================
FILE: docs/annotations/constituency/ctb.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# Chinese Tree Bank

See also [The Bracketing Guidelines for the Penn Chinese Treebank (3.0)](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1040&context=ircs_reports).

| Tag  | Definition                             | 定义                                     | 例子            |
|------|----------------------------------------------|----------------------------------------------------|-------------------|
| ADJP | adjective phrase                             | 形容词短语，以形容词为中心词                                     | 不完全、大型            |
| ADVP | adverbial phrase headed by AD (adverb)       | 副词短语，以副词为中心词                                       | 非常、很              |
| CLP  | classifier phrase                            | 由量词构成的短语                                           | 系列、大批             |
| CP   | clause headed by C (complementizer)          | 从句，通常带补语（如“的”、“吗”等）                                | 张三喜欢李四吗？          |
| DNP  | phrase formed by ‘‘XP + DEG’’                | 结构为XP + DEG(的)的短语，其中XP可以是ADJP、DP、QP、PP等等，用于修饰名词短语。 | 大型的、前几年的、五年的、在上海的 |
| DP   | determiner phrase                            | 限定词短语，通常由限定词和数量词构成                                 | 这三个、任何            |
| DVP  | phrase formed by ‘‘XP + DEV’’                | 结构为XP+地的短语，用于修饰动词短语VP                              | 心情失落地、大批地         |
| FRAG | fragment                                     | 片段                                                 | （完）               |
| INTJ | interjection                                 | 插话，感叹语                                             | 哈哈、切              |
| IP   | simple clause headed by I (INFL)             | 简单子句或句子，通常不带补语（如“的”、“吗”等）                          | 张三喜欢李四。           |
| LCP  | phrase formed by ‘‘XP + LC’’                 | 用于表示地点+方位词（LC）的短语                                  | 生活中、田野上           |
| LST  | list marker                                  | 列表短语，包括标点符号                                        | 一.                |
| MSP  | some particles                               | 其他小品词                                              | 所、而、来、去           |
| NN   | common noun                                  | 名词                                                 | HanLP、技术          |
| NP   | noun phrase                                  | 名词短语，中心词通常为名词                                      | 美好生活、经济水平         |
| PP   | preposition phrase                           | 介词短语，中心词通常为介词                                      | 在北京、据报道           |
| PRN  | parenthetical                                | 插入语                                                | ，（张三说），           |
| QP   | quantifier phrase                            | 量词短语                                               | 三个、五百辆            |
| TOP  | root node                                    | 根节点                                                | 根节点               |
| UCP  | unidentical coordination phrase              | 不对称的并列短语，指并列词两侧的短语类型不一致                            | （养老、医疗）保险         |
| VCD  | coordinated verb compound                    | 复合动词                                               | 出版发行              |
| VCP  | verb compounds formed by VV + VC             | VV + VC形式的动词短语                                     | 看作是               |
| VNV  | verb compounds formed by A-not-A or A-one-A  | V不V形式的动词短语                                         | 能不能、信不信           |
| VP   | verb phrase                                  | 动词短语，中心词通常为动词                                      | 完成任务、努力工作         |
| VPT  | potential form V-de-R or V-bu-R              | V不R、V得R形式的动词短语                                     | 打不赢、打得过           |
| VRD  | verb resultative compound                    | 动补结构短语                                             | 研制成功、降下来          |
| VSB  | verb compounds formed by a modifier + a head | 修饰语+中心词构成的动词短语                                     | 拿来支付、仰头望去         |

================================================
FILE: docs/annotations/constituency/index.md
================================================
# Constituency Parsing

## Chinese
```{toctree}
ctb
```

## English
```{toctree}
ptb
```

## Japanese
```{toctree}
npcmj
```


================================================
FILE: docs/annotations/constituency/npcmj.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# NPCMJ

| Tag             | Description                             |
|-----------------|-----------------------------------------|
| ADVP            | adverb phrase                           |
| ADVP-CMPL       | complement adverb phrase                |
| ADVP-MSR        | measurement adverb phrase               |
| ADVP-PRD        | predicate adverb phrase                 |
| ADVP-TMP        | temporal adverb phrase                  |
| CONJP           | conjunction phrase                      |
| CP-EXL          | exclamative                             |
| CP-IMP          | imperative                              |
| CP-FINAL        | projection for sentence final particle  |
| CP-QUE          | question (direct or indirect)           |
| CP-QUE-ADV      | question used adverbially               |
| CP-QUE-OB1      | question used as object                 |
| CP-QUE-PRD      | question used as a nominal predicate    |
| CP-QUE-SBJ      | question used as subject                |
| CP-THT          | complementizer clause                   |
| CP-THT-ADV      | complementizer clause used adverbially  |
| CP-THT-OB1      | complementizer clause used as object    |
| CP-THT-PRD      | complementizer clause used as predicate |
| CP-THT-PRP      | purposive complementizer clause         |
| CP-THT-SBJ      | complementizer clause used as subject   |
| FRAG            | fragment                                |
| FS              | false start                             |
| INTJP           | interjection phrase                     |
| IP-ADV          | adverbial clause                        |
| IP-ADV-CONJ     | coordinated clause                      |
| IP-ADV-PRD      | adverbial clause used as predicate      |
| IP-ADV-SCON     | subordinate clause                      |
| IP-ADV-SCON-CND | conditional clause                      |
| IP-EMB          | gapless noun-modifying clause           |
| IP-IMP          | imperative clause                       |
| IP-MAT          | matrix clause                           |
| IP-NMZ          | nominalized clause                      |
| IP-NMZ-PRD      | nominalized clause used as predicate    |
| IP-REL          | relative clause                         |
| IP-SMC          | small clause                            |
| IP-SMC-CNT      | small clause in continuative form       |
| IP-SMC-OB1      | small clause used as object             |
| IP-SMC-SBJ      | small clause used as subject            |
| IP-SUB          | clause under CP* layer                  |
| multi-sentence  | multiple sentence                       |
| NML             | intermediate nominal layer              |
| NP              | noun phrase                             |
| NP-ADV          | adverbial noun phrase                   |
| NP-CZZ          | causee noun phrase                      |
| NP-DOB1         | derived primary object noun phrase      |
| NP-DSBJ         | derived subject noun phrase             |
| NP-LGS          | logical subject noun phrase             |
| NP-LOC          | locational noun phrase                  |
| NP-MSR          | measure noun phrase                     |
| NP-OB1          | primary object noun phrase              |
| NP-OB2          | secondary object noun phrase            |
| NP-POS          | possessive noun phrase                  |
| NP-PRD          | predicate noun phrase                   |
| NP-SBJ          | subject noun phrase                     |
| NP-SBJ2         | secondary subject noun phrase           |
| NP-TMP          | temporal noun phrase                    |
| NP-TPC          | topic noun phrase                       |
| NP-VOC          | vocative noun phrase                    |
| NUMCLP          | numeral-classifier phrase               |
| PNLP            | prenominal phrase                       |
| PP              | particle phrase                         |
| PP-ADV          | adverbial particle phrase               |
| PP-CMPL         | complement particle phrase              |
| PP-CONJ         | coordination particle phrase            |
| PP-CZZ          | causee particle phrase                  |
| PP-DOB1         | derived primary object particle phrase  |
| PP-DSBJ         | derived subject particle phrase         |
| PP-LGS          | logical subject particle phrase         |
| PP-LOC          | locational particle phrase              |
| PP-MSR          | measure particle phrase                 |
| PP-OB1          | primary object particle phrase          |
| PP-OB2          | secondary object particle phrase        |
| PP-PRD          | predicate particle phrase               |
| PP-PRP          | purposive particle phrase               |
| PP-SBJ          | subject particle phrase                 |
| PP-SBJ2         | secondary subject particle phrase       |
| PP-SCON         | subordination particle phrase           |
| PP-SCON-CND     | conditional particle phrase             |
| PP-TMP          | temporal particle phrase                |
| PP-TPC          | topic particle phrase                   |
| PP-VOC          | vocative particle phrase                |
| PRN             | parenthetical                           |

================================================
FILE: docs/annotations/constituency/ptb.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# Penn Treebank

| Tag    | Description                                                                                                                                                                                                         |
|--------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| ADJP   | Adjective Phrase.                                                                                                                                                                                                   |
| ADVP   | Adverb Phrase.                                                                                                                                                                                                      |
| CONJP  | Conjunction Phrase.                                                                                                                                                                                                 |
| FRAG   | Fragment.                                                                                                                                                                                                           |
| INTJ   | Interjection. Corresponds approximately to the part-of-speech tag UH.                                                                                                                                               |
| LST    | List marker. Includes surrounding punctuation.                                                                                                                                                                      |
| NAC    | Not a Constituent; used to show the scope of certain prenominal modifiers within an NP.                                                                                                                             |
| NP     | Noun Phrase.                                                                                                                                                                                                        |
| NX     | Used within certain complex NPs to mark the head of the NP. Corresponds very roughly to N-bar level but used quite differently.                                                                                     |
| PP     | Prepositional Phrase.                                                                                                                                                                                               |
| PRN    | Parenthetical                                                                                                                                                                                                       |
| PRT    | Particle. Category for words that should be tagged RP.                                                                                                                                                              |
| QP     | Quantifier Phrase (i.e. complex measure/amount phrase); used within NP.                                                                                                                                             |
| ROOT   | No description                                                                                                                                                                                                      |
| RRC    | Reduced Relative Clause.                                                                                                                                                                                            |
| S      | Simple declarative clause, i.e. one that is not introduced by a (possibly empty) subordinating conjunction or a wh-word and that does not exhibit subject-verb inversion.                                           |
| SBAR   | Clause introduced by a (possibly empty) subordinating conjunction.                                                                                                                                                  |
| SBARQ  | Direct question introduced by a wh-word or a wh-phrase. Indirect questions and relative clauses should be bracketed as SBAR, not SBARQ.                                                                             |
| SINV   | Inverted declarative sentence, i.e. one in which the subject follows the tensed verb or modal.                                                                                                                      |
| SQ     | Inverted yes/no question, or main clause of a wh-question, following the wh-phrase in SBARQ.                                                                                                                        |
| UCP    | Unlike Coordinated Phrase.                                                                                                                                                                                          |
| VP     | Verb Phrase.                                                                                                                                                                                                       |
| WHADJP | Wh-adjective Phrase. Adjectival phrase containing a wh-adverb, as in how hot.                                                                                                                                       |
| WHADVP | Wh-adverb Phrase. Introduces a clause with an NP gap. May be null (containing the 0 complementizer) or lexical, containing a wh-adverb such as how or why.                                                          |
| WHNP   | Wh-noun Phrase. Introduces a clause with an NP gap. May be null (containing the 0 complementizer) or lexical, containing some wh-word, e.g. who, which book, whose daughter, none of which, or how many leopards.   |
| WHPP   | Wh-prepositional Phrase. Prepositional phrase containing a wh-noun phrase (such as of which or by whose authority) that either introduces a PP gap or is contained by a WHNP.                                       |
| X      | Unknown, uncertain, or unbracketable. X is often used for bracketing typos and in bracketing the…the-constructions.                                                                                                 |


================================================
FILE: docs/annotations/dep/index.md
================================================
# Dependency Parsing

## Chinese

```{toctree}
sd_zh
pmt
```

## English

```{toctree}
sd_en
```

## Multilingual

```{toctree}
ud
```


================================================
FILE: docs/annotations/dep/pmt.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# PKU Multi-view Chinese Treebank


```{eval-rst}

See also :cite:`qiu-etal-2014-multi`.
    
```

| Tag  | Description                                 | 依存关系       |
| ---- | ------------------------------------------- | -------------- |
| ACT  | action object                               | 行为宾语       |
| ADV  | adverbial                                   | 状语           |
| APP  | appositive element                          | 同位           |
| ATT  | attribute                                   | 定语           |
| CMP  | complement                                  | 补语           |
| COO  | other coordination element                  | 一般并列       |
| COS  | share-right-child coordination element      | 共享并列       |
| DE   | de (modifier of 的(special function word))  | 的字           |
| DEI  | dei (modifier of 得(special function word)) | 得字           |
| DI   | di (modifier of 地(special function word))  | 地字           |
| FOC  | focus                                       | 强调           |
| HED  | root of a sentence                          | 核心           |
| IC   | independent clause                          | 小句           |
| IOB  | indirect object                             | 间接宾语       |
| IS   | independent structure                       | 独立结构       |
| ISC  | non-shared independent structure            | 并列式独立结构 |
| LAD  | left additive                               | 前附加         |
| MT   | modality and time                           | 时体           |
| NUM  | number                                      | 数字           |
| POB  | prepositional object                        | 介宾           |
| PUN  | punctuation                                 | 标点           |
| PUS  | cross-clause punctuation                    | 跨句标点       |
| QUC  | post-positional quantity                    | 数量补语       |
| QUCC | non-shared post-positional quantity         | 非共享数量补语 |
| QUN  | quantity                                    | 数量           |
| RAD  | right additive                              | 后附加         |
| RADC | non-shared right additive                   | 非共享后附加   |
| RED  | reduplicate element                         | 重叠           |
| SBV  | subject                                     | 主语           |
| TPC  | topic                                       | 话题           |
| VOB  | direct object                               | 宾语           |
| VV   | serial verb construction                    | 连动           |


================================================
FILE: docs/annotations/dep/sd_en.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# Stanford Dependencies English

See also [Stanford typed dependencies manual](https://nlp.stanford.edu/software/dependencies_manual.pdf).

| Tag        | Description                       |
|------------|-----------------------------------|
| abbrev     | abbreviation modifier             |
| acomp      | adjectival complement             |
| advcl      | adverbial clause modifier         |
| advmod     | adverbial modifier                |
| agent      | agent                             |
| amod       | adjectival modifier               |
| appos      | appositional modifier             |
| arg        | argument                          |
| attr       | attributive                       |
| aux        | auxiliary                         |
| auxpass    | passive auxiliary                 |
| cc         | coordination                      |
| ccomp      | clausal complement                |
| comp       | complement                        |
| complm     | complementizer                    |
| conj       | conjunct                          |
| cop        | copula                            |
| csubj      | clausal subject                   |
| csubjpass  | clausal passive subject           |
| dep        | dependent                         |
| det        | determiner                        |
| discourse  | discourse element                 |
| dobj       | direct object                     |
| expl       | expletive                         |
| goeswith   | goes with                         |
| iobj       | indirect object                   |
| mark       | marker                            |
| mod        | modifier                          |
| mwe        | multi-word expression             |
| neg        | negation modifier                 |
| nn         | noun compound modifier            |
| npadvmod   | noun phrase as adverbial modifier |
| nsubj      | nominal subject                   |
| nsubjpass  | passive nominal subject           |
| num        | numeric modifier                  |
| number     | element of compound number        |
| obj        | object                            |
| parataxis  | parataxis                         |
| pcomp      | prepositional complement          |
| pobj       | object of a preposition           |
| poss       | possession modifier               |
| possessive | possessive modifier               |
| preconj    | preconjunct                       |
| pred       | predicate                         |
| predet     | predeterminer                     |
| prep       | prepositional modifier            |
| prepc      | prepositional clausal modifier    |
| prt        | phrasal verb particle             |
| punct      | punctuation                       |
| purpcl     | purpose clause modifier           |
| quantmod   | quantifier phrase modifier        |
| rcmod      | relative clause modifier          |
| ref        | referent                          |
| rel        | relative                          |
| root       | root                              |
| sdep       | semantic dependent                |
| subj       | subject                           |
| tmod       | temporal modifier                 |
| vmod       | verb modifier                     |
| xcomp      | open clausal complement           |
| xsubj      | controlling subject               |

================================================
FILE: docs/annotations/dep/sd_zh.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# Stanford Dependencies Chinese


```{eval-rst}

See also :cite:`chang-etal-2009-discriminative`.
    
```

|Tag|Description|中文简称|例句|依存弧|
| ---- | ---- | ---- | ---- | ---- |
|nn|noun compound modifier|复合名词修饰|服务中心|nn(中心，服务）|
|punct|punctuation|标点符号|海关统计表明，|punct(表明，，)|
|nsubj|nominal subject|名词性主语|梅花盛开|nsubj(盛开，梅花）|
|conj|conjunct (links two conjuncts)|连接性状语|设备和原材料|conj(原材料，设备）|
|dobj|direct object|直接宾语|浦东颁布了七十一件文件|dobj(颁布，文件）|
|advmod|adverbial modifier|副词性状语|部门先送上文件|advmod(送上，先）|
|prep|prepositional modifier|介词性修饰语|在实践中逐步完善|prep(完善，在）|
|nummod|number modifier|数词修饰语|七十一件文件|nummod(件，七十一）|
|amod|adjectival modifier|形容词修饰语|跨世纪工程|amod(工程，跨世纪）|
|pobj|prepositional object|介词性宾语|根据有关规定|pobj(根据，规定）|
|rcmod|relative clause modifier|关系从句修饰语|不曾遇到过的情况|rcmod(情况，遇到）|
|cpm|complementizer|补语|开发浦东的经济活动|cpm(开发，的）|
|assm|associative marker|关联标记|企业的商品|assm(企业，的）|
|assmod|associative modifier|关联修饰|企业的商品|assmod(商品，企业）|
|cc|coordinating conjunction|并列关系|设备和原材料|cc(原材料，和）|
|clf|classifier modifier|类别修饰|七十一件文件|clf(文件，件）|
|ccomp|clausal complement|从句补充|银行决定先取得信用评级|ccomp(决定，取得）|
|det|determiner|限定语|这些经济活动|det(活动，这些）|
|lobj|localizer object|范围宾语|近年来|lobj(来，近年）|
|range|dative object that is a quantifier phrase|数量词间接宾语|成交药品一亿多元|range(成交，元）|
|asp|aspect marker|时态标记|发挥了作用|asp(发挥，了）|
|tmod|temporal modifier|时间修饰语|以前不曾遇到过|tmod(遇到，以前）|
|plmod|localizer modifier of a preposition|介词性地点修饰|在这片热土上|plmod(在，上）|
|attr|attributive|属性|贸易额为二百亿美元|attr(为，美元）|
|mmod|modal verb modifier|情态动词|利益能得到保障|mmod(得到，能）|
|loc|localizer|位置补语|占九成以上|loc(占，以上）|
|top|topic|主题|建筑是主要活动|top(是，建筑）|
|pccomp|clausal complement of a preposition|介词补语|据有关部门介绍|pccomp(据，介绍）|
|etc|etc modifier|省略关系|科技、文教等领域|etc(文教，等）|
|lccomp|clausal complement of a localizer|位置补语|中国对外开放中升起的明星|lccomp(中，开放）|
|ordmod|ordinal number modifier|量词修饰|第七个机构|ordmod(个，第七）|
|xsubj|controlling subject|控制主语|银行决定先取得信用评级|xsubj(取得，银行）|
|neg|negative modifier|否定修饰|以前不曾遇到过|neg(遇到，不）|
|rcomp|resultative complement|结果补语|研究成功|rcomp(研究，成功）|
|comod|coordinated verb compound modifier|并列联合动词|颁布实行|comod(颁布，实行）|
|vmod|verb modifier|动词修饰|其在支持外商企业方面的作用|vmod(方面，支持）|
|prtmod|particles such as 所，以，来，而|小品词|在产业化所取得的成就|prtmod(取得，所）|
|ba|“ba” construction|把字关系|把注意力转向市场|ba(转向，把）|
|dvpm|manner DE(地）modifier|地字修饰|有效地防止流失|dvpm(有效，地）|
|dvpmod|a "XP+DEV" phrase that modifies VP|地字动词短语|有效地防止流失|dvpmod(防止，有效）|
|prnmod|parenthetical modifier|插入词修饰|八五期间（1990-1995 )|prnmod(期间，1995)|
|cop|copular|系动词|原是自给自足的经济|cop(自给自足，是）|
|pass|passive marker|被动标记|被认定为高技术产业|pass(认定，被）|
|nsubjpass|nominal passive subject|被动名词主语|镍被称作现代工业的维生素|nsubjpass(称作，镍）|
|dep|dependent|其他依赖关系|新华社北京二月十二日电|dep(电，新华社）|


================================================
FILE: docs/annotations/dep/ud.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# Universal Dependencies

## Cross-Linguistic

See also [Universal Dependencies](https://universaldependencies.org/docs/u/dep/index.html).

| Tag        | Description                                  |
|------------|----------------------------------------------|
| acl        | clausal modifier of noun (adjectival clause) |
| advcl      | adverbial clause modifier                    |
| advmod     | adverbial modifier                           |
| amod       | adjectival modifier                          |
| appos      | appositional modifier                        |
| aux        | auxiliary                                    |
| auxpass    | passive auxiliary                            |
| case       | case marking                                 |
| cc         | coordinating conjunction                     |
| ccomp      | clausal complement                           |
| compound   | compound                                     |
| conj       | conjunct                                     |
| cop        | copula                                       |
| csubj      | clausal subject                              |
| csubjpass  | clausal passive subject                      |
| dep        | unspecified dependency                       |
| det        | determiner                                   |
| discourse  | discourse element                            |
| dislocated | dislocated elements                          |
| dobj       | direct object                                |
| expl       | expletive                                    |
| foreign    | foreign words                                |
| goeswith   | goes with                                    |
| iobj       | indirect object                              |
| list       | list                                         |
| mark       | marker                                       |
| mwe        | multi-word expression                        |
| name       | name                                         |
| neg        | negation modifier                            |
| nmod       | nominal modifier                             |
| nsubj      | nominal subject                              |
| nsubjpass  | passive nominal subject                      |
| nummod     | numeric modifier                             |
| parataxis  | parataxis                                    |
| punct      | punctuation                                  |
| remnant    | remnant in ellipsis                          |
| reparandum | overridden disfluency                        |
| root       | root                                         |
| vocative   | vocative                                     |
| xcomp      | open clausal complement                      |


## Localization

### Chinese

| Tag              |       简称 |                                                         例句 |
| :--------------- |---------:| -----------------------------------------------------------: |
| acl              |    形容词子句 | ![acl](https://file.hankcs.com/img/ud/1303b5cbe9413044cb800b3c3514b70b.svg) |
| advcl:loc        |  状语从句修饰语 | ![advcl:loc](https://file.hankcs.com/img/ud/e8865563caf0eda7a80043eda8cc43a6.svg) |
| advmod           |       状语 | ![advmod](https://file.hankcs.com/img/ud/3ce9276f4e18d92edb48e58956bbaee7.svg) |
| advmod:dvp       |     状语:地 | ![advmod:dvp](https://file.hankcs.com/img/ud/e90870682b9f0a80736d25977565f96a.svg) |
| advmod:loc       |    状语:限定 | ![advmod:loc](https://file.hankcs.com/img/ud/135e9143e73e5f45290d204d4ad5b30e.svg) |
| advmod:rcomp     |    状语:因果 | ![advmod:rcomp](https://file.hankcs.com/img/ud/aa75be342648bed0846f54a88f71e7a7.svg) |
| amod             |       形容 | ![amod](https://file.hankcs.com/img/ud/dee0097c244c1bd0a1d1ed117932346d.svg) |
| amod:ordmod      |    形容:数量 | ![amod:ordmod](https://file.hankcs.com/img/ud/8bb79245311a4190836dce8439591e91.svg) |
| appos            |       同位 | ![appos](https://file.hankcs.com/img/ud/a74f6a31f68ba5697d0a8906e8476b47.svg) |
| aux:asp          |    助语:时态 | ![aux:asp](https://file.hankcs.com/img/ud/8c32de9b4858c0e4d24ee6da5fb80a6e.svg) |
| aux:ba           |     助语:把 | ![aux:ba](https://file.hankcs.com/img/ud/2c712e3af49fcdbd5914398895904f3c.svg) |
| aux:modal        |    助语:情态 | ![aux:modal](https://file.hankcs.com/img/ud/606946c569e4bfbacbb1b9e13336e247.svg) |
| aux:prtmod       |    助语:分词 | ![aux:prtmod](https://file.hankcs.com/img/ud/fc49d338487dd63687941433a0633f5d.svg) |
| auxpass          |       被动 | ![auxpass](https://file.hankcs.com/img/ud/a6e4a8aabb7bb1bb5c4e9cdf7876e3f7.svg) |
| case             |       条件 | ![case](https://file.hankcs.com/img/ud/35a021e15a9355880cb8720ba34ed936.svg) |
| cc               |     并列连词 | ![cc](https://file.hankcs.com/img/ud/18c6a22520cec2ba60ce636bb410f651.svg) |
| ccomp            |     从句补语 | ![ccomp](https://file.hankcs.com/img/ud/8cc4ea0c6a090f1ba03d02926240c35b.svg) |
| compound:nn      |     复合名词 | ![compound:nn](https://file.hankcs.com/img/ud/587e12141aa42aa9862ea0ac0eb30e09.svg) |
| compound:vc      |     复合动词 | ![compound:vc](https://file.hankcs.com/img/ud/f72cedcb6cec8563d88063b118544a9d.svg) |
| conj             |       连接 | ![conj](https://file.hankcs.com/img/ud/fc924f495d1d5a3a828a0e2262da06cd.svg) |
| cop              |       系动 | ![cop](https://file.hankcs.com/img/ud/a7da58f57adbe9e6bd166ecb514f2d1c.svg) |
| csubj            |     从句主语 | ![csubj](https://file.hankcs.com/img/ud/0adda481e81b3765ed7f4f9d55c153c4.svg) |
| dep              |      未定义 | ![dep](https://file.hankcs.com/img/ud/db15b792f1bfd5e42982832b04c65a79.svg) |
| det              |       限定 | ![det](https://file.hankcs.com/img/ud/17376d13a4e7b0677cd18d13e0990dab.svg) |
| discourse        |       语气 | ![discourse](https://file.hankcs.com/img/ud/d7eb37d5fd13462b237140a08f0ed9a4.svg) |
| dobj             |     直接宾语 | ![dobj](https://file.hankcs.com/img/ud/f5e801103ddc57a9aeff0e272b8f7b44.svg) |
| etc              |       省略 | ![etc](https://file.hankcs.com/img/ud/86d3fd24cae9f585b7730119edaa0248.svg) |
| mark             |       标记 | ![mark](https://file.hankcs.com/img/ud/b17b4027ab368c76a3b6f085d5b561d9.svg) |
| mark:clf         |    标记:量词 | ![mark:clf](https://file.hankcs.com/img/ud/5974c92e3587aa64ba1d572243b9c5cc.svg) |
| name             |       名称 | ![name](https://file.hankcs.com/img/ud/63ea082457dfe6f4fc04f635a8c019f3.svg) |
| neg              |       否定 | ![neg](https://file.hankcs.com/img/ud/e38814231ff9a31dcce5672556375c94.svg) |
| nmod             |     名词修饰 | ![nmod](https://file.hankcs.com/img/ud/e948a8dbcd43984d14c257f0ace1753d.svg) |
| nmod:assmod      |  名词修饰:关联 | ![nmod:assmod](https://file.hankcs.com/img/ud/76349f30cef2c4978a03118d65ac6c81.svg) |
| nmod:poss        | 名词修饰:所有格 | ![nmod:poss](https://file.hankcs.com/img/ud/5b4937dbea42cdff7054e9dd0904bedb.svg) |
| nmod:prep        |  名词修饰:介词 | ![nmod:prep](https://file.hankcs.com/img/ud/63b92981638b758681a82e9f4a9aa04c.svg) |
| nmod:range       |  名词修饰:范围 | ![nmod:range](https://file.hankcs.com/img/ud/217ec98756cfe3750c76f5e5e89b7f54.svg) |
| nmod:tmod        |  名词修饰:时间 | ![nmod:tmod](https://file.hankcs.com/img/ud/166e3b8fb72db52f0ec332d444ea017f.svg) |
| nmod:topic       |  名词修饰:主题 | ![nmod:topic](https://file.hankcs.com/img/ud/93c83c98c188b131211ac5e9ff5242c0.svg) |
| nsubj            |     名词主语 | ![nsubj](https://file.hankcs.com/img/ud/63e3902d4a3045d1d696a0c4ed203563.svg) |
| nsubj:xsubj      |  名词主语:补语 | ![nsubj:xsubj](https://file.hankcs.com/img/ud/80cb355b9f9732fd888186a1f658b0ac.svg) |
| nsubjpass        |    被动态主语 | ![nsubjpass](https://file.hankcs.com/img/ud/6327fab58ab42d5a417b2e5c7018ac3a.svg) |
| nummod           |       数量 | ![nummod](https://file.hankcs.com/img/ud/0fd20559645265c2c937f06631aa74df.svg) |
| parataxis:prnmod |       并列 | ![parataxis:prnmod](https://file.hankcs.com/img/ud/783a0faf4cd935bb61f5d225a388b79e.svg) |
| punct            |     标点符号 | ![punct](https://file.hankcs.com/img/ud/983410055658352080ae476a5d85e6b5.svg) |
| root             |        根 | ![root](https://file.hankcs.com/img/ud/588101bec0440ffb769172f8b7e9f98e.svg) |
| xcomp            |     从句补语 | ![xcomp](https://file.hankcs.com/img/ud/c72071875f1c01e51acb9e1ec4893113.svg) |


================================================
FILE: docs/annotations/index.md
================================================
# Annotations


```{toctree}
tok/index
pos/index
ner/index
dep/index
sdp/index
srl/index
constituency/index
```


================================================
FILE: docs/annotations/ner/index.md
================================================
# Named Entity Recognition

## Chinese

```{toctree}
pku
msra
```

## Multilingual

```{toctree}
ontonotes
```


================================================
FILE: docs/annotations/ner/msra.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# msra

| Category | Subcategory    | Tag-set of Format-1 | Tag-set of Format-2 |
|----------|----------------|---------------------|---------------------|
| NAMEX    | Person         | P                   | PERSON              |
|          | Location       | L                   | LOCATION            |
|          | Organization   | O                   | ORGANIZATION        |
| TIMEX    | Date           | dat                 | DATE                |
|          | Duration       | dur                 | DURATION            |
|          | Time           | tim                 | TIME                |
| NUMEX    | Percent        | per                 | PERCENT             |
|          | Money          | mon                 | MONEY               |
|          | Frequency      | fre                 | FREQUENCY           |
|          | Integer        | int                 | INTEGER             |
|          | Fraction       | fra                 | FRACTION            |
|          | Decimal        | dec                 | DECIMAL             |
|          | Ordinal        | ord                 | ORDINAL             |
|          | Rate           | rat                 | RATE                |
| MEASUREX | Age            | age                 | AGE                 |
|          | Weight         | wei                 | WEIGHT              |
|          | Length         | len                 | LENGTH              |
|          | Temperature    | tem                 | TEMPERATURE         |
|          | Angle          | ang                 | ANGLE               |
|          | Area           | are                 | AREA                |
|          | Capacity       | cap                 | CAPACITY            |
|          | Speed          | spe                 | SPEED               |
|          | Acceleration   | acc                 | ACCELERATION        |
|          | Other measures | mea                 | MEASURE             |
| ADDREX   | Email          | ema                 | EMAIL               |
|          | Phone          | pho                 | PHONE               |
|          | Fax            | fax                 | FAX                 |
|          | Telex          | tel                 | TELEX               |
|          | WWW            | WWW                 | WWW                 |
|          | Postalcode     | pos                 | POSTALCODE          |


================================================
FILE: docs/annotations/ner/ontonotes.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# ontonotes

| TAG       | Description                        |
|--------------|------------------------------------------------------|
| PERSON       | People, including fictional                          |
| NORP         | Nationalities or religious or political groups       |
| FACILITY     | Buildings, airports, highways, bridges, etc.         |
| ORGANIZATION | Companies, agencies, institutions, etc.              |
| GPE          | Countries, cities, states                            |
| LOCATION     | Non-GPE locations, mountain ranges, bodies of water  |
| PRODUCT      | Vehicles, weapons, foods, etc. (Not services)        |
| EVENT        | Named hurricanes, battles, wars, sports events, etc. |
| WORK OF ART  | Titles of books, songs, etc.                         |
| LAW          | Named documents made into laws                       |
| DATE     | Absolute or relative dates or periods        |
| TIME     | Times smaller than a day                     |
| PERCENT  | Percentage                        |
| MONEY    | Monetary values, including unit              |
| QUANTITY | Measurements, as of weight or distance       |
| ORDINAL  | “first”, “second”                             |
| CARDINAL | Numerals that do not fall under another type |


================================================
FILE: docs/annotations/ner/pku.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# pku

| 序号 | 词性 | 名称     | 帮助记忆的诠释                                         | 例子及注解                                                   |
| ---- | ---- | -------- | ------------------------------------------------------ | ------------------------------------------------------------ |
| 1   | nr   | 人名     | 名词代码n和“人(ren)”的声母并在一起。                   | 1. 汉族人及与汉族起名方式相同的非汉族人的姓和名单独切分，并分别标注为nr。张/nr 仁伟/nr， 欧阳/nr 修/nr， 阮/nr 志雄/nr， 朴/nr 贞爱/nr汉族人除有单姓和复姓外，还有双姓，即有的女子出嫁后，在原来的姓上加上丈夫的姓。如：陈方安生。这种情况切分、标注为：陈/nr 方/nr 安生/nr；唐姜氏，切分、标注为：唐/nr 姜氏/nr。2. 姓名后的职务、职称或称呼要分开。江/nr 主席/n， 小平/nr 同志/n， 江/nr 总书记/n，张/nr 教授/n， 王/nr 部长/n， 陈/nr 老总/n， 李/nr 大娘/n， 刘/nr 阿姨/n， 龙/nr 姑姑/n3. 对人的简称、尊称等若为两个字，则合为一个切分单位，并标以nr。老张/nr， 大李/nr， 小郝/nr， 郭老/nr， 陈总/nr4. 明显带排行的亲属称谓要切分开，分不清楚的则不切开。三/m 哥/n， 大婶/n， 大/a 女儿/n， 大哥/n， 小弟/n， 老爸/n5. 一些著名作者的或不易区分姓和名的笔名通常作为一个切分单位。鲁迅/nr， 茅盾/nr， 巴金/nr， 三毛/nr， 琼瑶/nr， 白桦/nr6. 外国人或少数民族的译名（包括日本人的姓名）不予切分，标注为nr。克林顿/nr， 叶利钦/nr， 才旦卓玛/nr， 小林多喜二/nr， 北研二/nr，华盛顿/nr， 爱因斯坦/nr有些西方人的姓名中有小圆点，也不分开。卡尔·马克思/nr |
| 2   | ns   | 地名     | 名词代码n和处所词代码s并在一起。                       | 安徽/ns，深圳/ns，杭州/ns，拉萨/ns，哈尔滨/ns， 呼和浩特/ns， 乌鲁木齐/ns，长江/ns，黄海/ns，太平洋/ns， 泰山/ns， 华山/ns，亚洲/ns， 海南岛/ns，太湖/ns，白洋淀/ns， 俄罗斯/ns，哈萨克斯坦/ns，彼得堡/ns， 伏尔加格勒/ns 1. 国名不论长短，作为一个切分单位。中国/ns， 中华人民共和国/ns， 日本国/ns， 美利坚合众国/ns， 美国/ns2. 地名后有“省”、“市”、“县”、“区”、“乡”、“镇”、“村”、“旗”、“州”、“都”、“府”、“道”等单字的行政区划名称时，不切分开，作为一个切分单位。四川省/ns， 天津市/ns，景德镇/ns沙市市/ns， 牡丹江市/ns，正定县/ns，海淀区/ns， 通州区/ns，东升乡/ns， 双桥镇/ns 南化村/ns，华盛顿州/ns，俄亥俄州/ns，东京都/ns， 大阪府/ns，北海道/ns， 长野县/ns，开封府/ns，宣城县/ns3. 地名后的行政区划有两个以上的汉字，则将地名同行政区划名称切开，不过要将地名同行政区划名称用方括号括起来，并标以短语NS。[芜湖/ns 专区/n] NS，[宣城/ns 地区/n]ns，[内蒙古/ns 自治区/n]NS，[深圳/ns 特区/n]NS， [厦门/ns 经济/n 特区/n]NS， [香港/ns 特别/a 行政区/n]NS，[香港/ns 特区/n]NS， [华盛顿/ns 特区/n]NS，4. 地名后有表示地形地貌的一个字的普通名词，如“江、河、山、洋、海、岛、峰、湖”等，不予切分。鸭绿江/ns，亚马逊河/ns， 喜马拉雅山/ns， 珠穆朗玛峰/ns，地中海/ns，大西洋/ns，洞庭湖/ns， 塞普路斯岛/ns 5. 地名后接的表示地形地貌的普通名词若有两个以上汉字，则应切开。然后将地名同该普通名词标成短语NS。[台湾/ns 海峡/n]NS，[华北/ns 平原/n]NS，[帕米尔/ns 高原/n]NS， [南沙/ns 群岛/n]NS，[京东/ns 大/a 峡谷/n]NS [横断/b 山脉/n]NS6．地名后有表示自然区划的一个字的普通名词，如“ 街，路，道，巷，里，町，庄，村，弄，堡”等，不予切分。 中关村/ns，长安街/ns，学院路/ns， 景德镇/ns， 吴家堡/ns， 庞各庄/ns， 三元里/ns，彼得堡/ns， 北菜市巷/ns， 7．地名后接的表示自然区划的普通名词若有两个以上汉字，则应切开。然后将地名同自然区划名词标成短语NS。[米市/ns 大街/n]NS， [蒋家/nz 胡同/n]NS ， [陶然亭/ns 公园/n]NS ， 8． 大小地名相连时的标注方式为：北京市/ns 海淀区/ns 海淀镇/ns [南/f 大街/n]NS [蒋家/nz 胡同/n]NS 24/m 号/q ， |
| 3   | nt   | 机构团体 | “团”的声母为t，名词代码n和t并在一起。                  | （参见2。短语标记说明--NT）联合国/nt，中共中央/nt，国务院/nt， 北京大学/nt1．大多数团体、机构、组织的专有名称一般是短语型的，较长，且含有地名或人名等专名，再组合，标注为短语NT。[中国/ns 计算机/n 学会/n]NT， [香港/ns 钟表业/n 总会/n]NT， [烟台/ns 大学/n]NT， [香港/ns 理工大学/n]NT， [华东/ns 理工大学/n]NT，[合肥/ns 师范/n 学院/n]NT， [北京/ns 图书馆/n]NT， [富士通/nz 株式会社/n]NT， [香山/ns 植物园/n]NT， [安娜/nz 美容院/n]NT，[上海/ns 手表/n 厂/n]NT， [永和/nz 烧饼铺/n]NT，[北京/ns 国安/nz 队/n]NT，2. 对于在国际或中国范围内的知名的唯一的团体、机构、组织的名称即使前面没有专名，也标为nt或NT。联合国/nt，国务院/nt，外交部/nt， 财政部/nt，教育部/nt， 国防部/nt，[世界/n 贸易/n 组织/n]NT， [国家/n 教育/vn 委员会/n]NT，[信息/n 产业/n 部/n]NT，[全国/n 信息/n 技术/n 标准化/vn 委员会/n]NT，[全国/n 总/b 工会/n]NT，[全国/n 人民/n 代表/n 大会/n]NT，美国的“国务院”，其他国家的“外交部、财政部、教育部”，必须在其所属国的国名之后出现时，才联合标注为NT。[美国/ns 国务院/n]NT，[法国/ns 外交部/n]NT，[美/j 国会/n]NT，日本有些政府机构名称很特别，无论是否出现在“日本”国名之后都标为nt。[日本/ns 外务省/nt]NT，[日/j 通产省/nt]NT通产省/nt 3. 前后相连有上下位关系的团体机构组织名称的处理方式如下:[联合国/nt 教科文/j 组织/n]NT， [中国/ns 银行/n 北京/ns 分行/n]NT，[河北省/ns 正定县/ns 西平乐乡/ns 南化村/ns 党支部/n]NT， 当下位名称含有专名（如“北京/ns 分行/n”、“南化村/ns 党支部/n”、“昌平/ns 分校/n”）时，也可脱离前面的上位名称单独标注为NT。[中国/ns 银行/n]NT [北京/ns 分行/n]NT，北京大学/nt [昌平/ns 分校/n]NT，4. 团体、机构、组织名称中用圆括号加注简称时:[宝山/ns 钢铁/n （/w 宝钢/j ）/w 总/b 公司/n]NT，[宝山/ns 钢铁/n 总/b 公司/n]NT，（/w 宝钢/j ）/w |

================================================
FILE: docs/annotations/pos/863.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# 863

| 词性  |   名称   |              说明              |                                                                                                                                                                                                                                    例子                                                                                                                                                                                                                                    |
| :-- | -----: | ---------------------------: | -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| a   |    形容词 |        取英语形容词adjective的第1个字母 |                                                                                                                                                                                                                                                                                                                                                                                                                                         [重要/a 步伐/n]NP ，美丽/a ，看似/v 抽象/a ， |
| c   |     连词 |      取英语连词conjunction的第1个字母。 |                                                                                                                                                                                                                                                                                                                                                                                                                                                           合作/vn 与/c 伙伴/n |
| d   |     副词 | 取adverb的第2个字母，因其第1个字母已用于形容词。 |                                                                                                                                                                                                                                                                                                                                                                                                                                                             进一步/d 发展/v ， |
| e   |     叹词 |      取英语叹词exclamation的第1个字母。 |                                                                                                                                                                                                                                                                                                                                                                                                                                             啊/e ，/w 那/r 金灿灿/z 的/u 麦穗/n ， |
| f   |    方位词 |                      取汉字“方”。 |                                                                                                                                                                                                                                                                                                                                                                                                                                    军人/n 的/u 眼睛/n 里/f 不/d 是/v 没有/v 风景/n ， |
| g   |    语素字 |                              |                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
| h   |   前接成分 |               取英语head的第1个字母。 |                                                                                                                                                                                                                                                                                                                                                                                                          许多/m 非/h 主角/n 人物/n ，办事处/n 的/u “/w 准/h 政府/n ”/w 功能/n 不断/d 加强/v ， |
| i   |     成语 |            取英语成语idiom的第1个字母。 |                                                                                                                                                                                                                                                                                                                                                                                                                                                         一言一行/i ，义无反顾/i ， |
| j   |   简称略语 |                   取汉字“简”的声母。 |                                                                                                                                                                                                                                                                                                                                                                                                                                                     [德/j 外长/n]NP ，文教/j ， |
| k   |   后接成分 |                        后接成分。 |                                                                                                                                                                                                                                                                                                                                                                                                                                         少年儿童/l 朋友/n 们/k ，身体/n 健康/a 者/k ， |
| m   |     数词 |    取英语numeral的第3个字母，n，u已有他用。 | 1．数量词组应切分为数词和量词。 三/m 个/q， 10/m 公斤/q， 一/m 盒/q 点心/n ，但少数数量词已是词典的登录单位，则不再切分。 一个/m ， 一些/m ，2. 基数、序数、小数、分数、百分数一律不予切分，为一个切分单位，标注为 m 。一百二十三/m，20万/m， 123.54/m， 一个/m， 第一/m， 第三十五/m， 20%/m， 三分之二/m， 千分之三十/m， 几十/m 人/n， 十几万/m 元/q， 第一百零一/m 个/q ，3. 约数，前加副词、形容词或后加“来、多、左右”等助数词的应予分开。约/d 一百/m 多/m 万/m，仅/d 一百/m 个/q， 四十/m 来/m 个/q，二十/m 余/m 只/q， 十几/m 个/q，三十/m 左右/m ，两个数词相连的及“成百”、“上千”等则不予切分。五六/m 年/q， 七八/m 天/q，十七八/m 岁/q， 成百/m 学生/n，上千/m 人/n， 4．表序关系的“数＋名”结构，应予切分。二/m 连/n ，　三/m 部/n ， |
| mq  |    数量词 |                              |                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
| n   |     名词 |             取英语名词noun的第1个字母。 |                                                                                                                                                                                                                                                                                                                                                                                                                        （参见 动词--v）岗位/n ， 城市/n ， 机会/n ，她/r 是/v 责任/n 编辑/n ， |
| nd  |   方位名词 |           方位名词（nd），表示位置的相对方向 |                                                                                                                                                                                                                                                                                                                                                                                                                  上  下  左  右  前  后  里  外  中  东  西  南  北前边  左面  里头  中间  外部 |
| nh  |     人名 |           人名（nh），表示人的名称的专有名词 |                                                                                                                                                                                                                                                                                                                                                                                                                                        华罗庚  阿凡提  诸葛亮  司马相如  松赞干布  卡尔·马克思 |
| nhf |      姓 |                              |                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
| nhs |      名 |                              |                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
| ni  |    机构名 |    机构名（ni），表示团体、组织、机构名称的专有名词 |                                                                                                                                                                                                                                                                                                                                                                                                                                                    联合国  教育部  北京大学  中国科学院 |
| nl  |   处所名词 |                处所名词（nl），表示处所 |                                                                                                                                                                                                                                                                                                                                                                                                                                           空中  高处  隔壁  门口  附近  边疆  一旁  野外 |
| ns  |     地名 |         地名（ns），表示地理区域名称的专有名词 |                                                                                                                                                                                                                                                                                                                                                                                                                       亚洲  大西洋  地中海  阿尔卑斯山  加拿大中国  北京  浙江  景德镇  呼和浩特  中关村 |
| nt  |   时间名词 |          时间名词（nt），包括一般所说的时量词 |                                                                                                                                                                                                                                                                                                                                                                                                                                 年  月  日  分  秒现在  过去  昨天  去年  将来  宋朝  星期一 |
| nz  | 其他专有名词 |                   其他专有名词（nz） |                                                                                                                                                                                                                                                                                                                                                                                                                                                           五粮液  宫爆鸡丁  桑塔纳 |
| o   |    拟声词 |    取英语拟声词onomatopoeia的第1个字母。 |                                                                                                                                                                                                                                                                                                                                                                                                                                          哈哈/o 一/m 笑/v ，装载机/n 隆隆/o 推进/v ， |
| p   |     介词 |    取英语介词prepositional的第1个字母。 |                                                                                                                                                                                                                                                                                                                                                                                对/p 子孙后代/n 负责/v ，以/p 煤/n 养/v 农/Ng ，为/p 治理/v 荒山/n 服务/v ， 把/p 青年/n 推/v 上/v 了/u 领导/vn 岗位/n ， |
| q   |     量词 |           取英语quantity的第1个字母。 |                                                                                                                                                                                                                                                                                                                                                                                                                                                （参见数词m）首/m 批/q ，一/m 年/q ， |
| r   |     代词 |  取英语代词pronoun的第2个字母，因p已用于介词。 |                                                                                                                                                                                                                                                                                                                                                                           单音节代词“本”、“每”、“各”、“诸”后接单音节名词时，和后接的单音节名词合为代词；当后接双音节名词时，应予切分。本报/r， 每人/r， 本社/r， 本/r 地区/n， 各/r 部门/n |
| u   |     助词 |              取英语助词auxiliary。 |                                                                                                                                                                                                                                                                                                                                                                   [[俄罗斯/ns 和/c 北约/j]NP-BL 之间/f [战略/n 伙伴/n 关系/n]NP 的/u 建立/vn]NP 填平/v 了/u [[欧洲/ns 安全/a 政治/n]NP 的/u 鸿沟/n]NP |
| v   |     动词 |             取英语动词verb的第一个字母。 |                                                                                                                                                                                                                                                                                                                （参见 名词--n）[[[欧盟/j 扩大/v]S 的/u [历史性/n 决定/n]NP]NP 和/c [北约/j 开放/v]S]NP-BL [为/p [创建/v [一/m 种/q 新/a 的/u 欧洲/ns 安全/a 格局/n]NP]VP-SBI]PP-MD [奠定/v 了/u 基础/n]V-SBI ，， |
| vd  |   趋向动词 |                趋向动词（vd），表示趋向 |                                                                                                                                                                                                                                                                                                                                                                                                                      （走）上   （趴）下   （进）来   （回）去（跑）上来  （掉）下去  （提）起来  （扔）过去 |
| vl  |   联系动词 |             联系动词（vl），表示关系的判断 |                                                                                                                                                                                                                                                                                                                                                                                                                                                                        是 |
| vu  |   能愿动词 |             能愿动词（vu），表示可能、意愿 |                                                                                                                                                                                                                                                                                                                                                                                                                                             能够  能  应该  可以  可能  情愿  愿意  要 |
| w   |   标点符号 |                              |                                                                                                                                                                                                                                                                                                                                                                                                                                                                  ”/w ：/w |
| ws  | 非汉字字符串 |                非汉字字符串（ws），如： |                                                                                                                                                                                                                                                                                                                                                                                                                                                    HanLP office  windows |
| x   |   非语素字 |  非语素字只是一个符号，字母x通常用于代表未知数、符号。 |  |


================================================
FILE: docs/annotations/pos/ctb.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# ctb

 See also [The Part-Of-Speech Tagging Guidelines for the Penn Chinese Treebank (3.0)](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1039&context=ircs_reports).

| Tag  | Description                                          | Chinese | Chinese Description                                                      | Examples              |
|-----|------------------------------------------------------|---------|---------------------------------------------------------|------------------------|
| AD  | adverb                                               | 副词      | 副词                                                      | 仍然、很、大大、约              |
| AS  | aspect marker                                        | 动态助词    | 助词                                                      | 了、着、过                  |
| BA  | `bǎ` in ba-construction                              | 把字句     | 当“把”、“将”出现在结构“NP0 + BA + NP1+VP”时的词性                    | 把、将                    |
| CC  | coordinating conjunction                             | 并列连接词   | 并列连词                                                    | 与、和、或者、还是              |
| CD  | cardinal number                                      | 基数词     | 数词或表达数量的词                                               | 一百、好些、若干               |
| CS  | subordinating conjunction                            | 从属连词    | 从属连词                                                    | 如果、那么、就                |
| DEC | `de` as a complementizer or a nominalizer            | 补语成分“的” | 当“的”或“之”作补语标记或名词化标记时的词性，其结构为：S/VP DEC {NP}，如，喜欢旅游的大学生   | 的、之                    |
| DEG | `de` as a genitive marker and an associative marker  | 属格“的”   | 当“的”或“之”作所有格时的词性，其结构为:NP/PP/JJ/DT DEG {NP}， 如，他的车、经济的发展 | 的、之                    |
| DER | resultative `de`, `de` in V-de const and V-de-R      | 表结果的“得” | 当“得”出现在结构“V-得-R”时的词性，如，他跑得很快                            | 得                      |
| DEV | manner `de`, `de` before VP                          | 表方式的“地” | 当“地”出现在结构“X-地-VP”时的词性，如，高兴地说                            | 地                      |
| DT  | determiner                                           | 限定词     | 代冠词，通常用来修饰名词                                            | 这、那、该、每、各              |
| ETC | for words like "etc."                                | 表示省略    | “等”、“等等”的词性                                             | 等、等等              |
| EM  | emoji                                                | 表情符     | 表情符、或称颜文字                                      | ：）             |
| FW  | foreign words                                        | 外来语     | 外来词                                                     | 卡拉、A型                  |
| IC  | incomplete component                                 | 不完整成分   | 不完整成分，尤指ASR导致的错误                         | 好*xin*、那个*ba*  |
| IJ  | interjection                                         | 句首感叹词   | 感叹词，通常出现在句子首部                                           | 啊                      |
| JJ  | other noun-modifier                                  | 其他名词修饰语 | 形容词                                                     | 共同、新                   |
| LB  | `bèi` in long bei-const                              | 长句式表被动  | 当“被”、“叫”、“给”出现在结构“NP0 + LB + NP1 + VP”时的词性，如，他被我训了一顿      | 被、叫、给                  |
| LC  | localizer                                            | 方位词     | 方位词以及表示范围的限定词                                                     | 前、旁、到、在内、以来、为止               |
| M   | measure word                                         | 量词      | 量词                                                      | 个、群、公里                 |
| MSP | other particle                                       | 其他小品词   | 其他虚词，包括“所”、“以”、“来”和“而”等出现在VP前的词                         | 所、以、来、而                |
| NN  | common noun                                          | 其他名词    | 除专有名词和时间名词外的所有名词                                        | 桌子、生活、经济               |
| NOI | noise from characters written in the wrong order     | 噪声      | 汉字顺序颠倒产生的噪声                    | 事/NOI 类/NOI 各/NOI 故/NOI |
| NR  | proper noun                                          | 专有名词    | 专有名词，通常表示地名、人名、机构名等                                     | 北京、乔丹、微软               |
| NT  | temporal noun                                        | 时间名词    | 表示时间概念的名词                                               | 一月、汉朝、当今               |
| OD  | ordinal number                                       | 序数词     | 序数词                                                     | 第一百                    |
| ON  | onomatopoeia                                         | 象声词     | 象声词                                                     | 哗哗、呼、咯吱              |
| P   | preposition e.g., "from" and "to"                    | 介词      | 介词                                                      | 从、对、根据                 |
| PN  | pronoun                                              | 代词      | 代词，通常用来指代名词                                             | 我、这些、其、自己              |
| PU  | punctuation                                          | 标点符号    | 标点符号                                                    | ?、。、；                  |
| SB  | `bèi` in short bei-const                             | 短句式表被动  | 当“被”、“给”出现在结构“NP0 + SB + VP”时的词性，如，他被训了一顿               | 被、给                    |
| SP  | sentence final particle                              | 句末助词    | 经常出现在句尾的词                                               | 吧、呢、啊、吗                |
| URL | web address                                          | 网址      | 网址                                                      | www.hankcs.com         |
| VA  | predicative adjective                                | 表语形容词   | 可以接在“很”后面的形容词谓语                                         | 雪白、厉害                  |
| VC  | copula, be words                                     | 系动词     | 系动词，表示“是”或“非”概念的动词                                       | 是、为、非                  |
| VE  | `yǒu` as the main verb                               | 动词有无    | 表示“有”或“无”概念的动词                                          | 有、没有、无                 |
| VV  | other verb                                           | 其他动词    | 其他普通动词，包括情态词、控制动词、动作动词、心理动词等等                           | 可能、要、走、喜欢              |


================================================
FILE: docs/annotations/pos/index.md
================================================
# Part-of-Speech Tagging

## Chinese
```{toctree}
ctb
pku
863
```

## Japanese
```{toctree}
npcmj
```

## Multilingual

```{toctree}
ud
```


================================================
FILE: docs/annotations/pos/npcmj.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# NPCMJ


| Tag       | Description                       |
|-----------|-----------------------------------|
| ADJI      | イ-adjective                      |
| ADJI-MD   | modal イ-adjective                |
| ADJN      | ナ-adjective                      |
| ADJN-MD   | modal ナ-adjective                |
| ADV       | adverb                            |
| AX        | auxiliary verb (including copula) |
| AXD       | auxiliary verb, past tense        |
| CL        | classifier                        |
| CONJ      | coordinating conjunction          |
| D         | determiner                        |
| FN        | formal noun                       |
| FW        | foreign word                      |
| INTJ      | interjection                      |
| MD        | modal element                     |
| N         | noun                              |
| N-MENTION | mentioned expression              |
| NEG       | negation                          |
| NPR       | proper noun                       |
| NUM       | numeral                           |
| P-COMP    | complementizer particle           |
| P-CONN    | conjunctional particle            |
| P-FINAL   | final particle                    |
| P-INTJ    | interjectional particle           |
| P-OPTR    | toritate particle                 |
| P-ROLE    | role particle                     |
| PASS      | direct passive                    |
| PASS2     | indirect passive                  |
| PNL       | prenominal                        |
| PRO       | pronoun                           |
| PU        | punctuation                       |
| PUL       | left bracket                      |
| PUR       | right bracket                     |
| Q         | quantifier                        |
| QUOT      | quote                             |
| SYM       | symbol                            |
| VB        | verb (or verb stem)               |
| VB0       | light verb                        |
| VB2       | secondary verb                    |
| WADV      | indeterminate adverb              |
| WD        | indeterminate determiner          |
| WNUM      | indeterminate numeral             |
| WPRO      | indeterminate pronoun             |

================================================
FILE: docs/annotations/pos/pku.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# pku

| 序号 | 词性 | 名称     | 帮助记忆的诠释                                         | 例子及注解                                                   |
| ---- | ---- | -------- | ------------------------------------------------------ | ------------------------------------------------------------ |
| 1    | Ag   | 形语素   | 形容词性语素。形容词代码为a，语素代码g前面置以A。      | 绿色/n 似/d 锦/Ag ，                                         |
| 2    | a    | 形容词   | 取英语形容词adjective的第1个字母                       | [重要/a 步伐/n]NP ，美丽/a ，看似/v 抽象/a ，                |
| 3    | ad   | 副形词   | 直接作状语的形容词。形容词代码a和副词代码d并在一起。   | [积极/ad 谋求/v]V-ZZ ，幻象/n 易/ad 逝/Vg ，                 |
| 4    | an   | 名形词   | 具有名词功能的形容词。形容词代码a和名词代码n并在一起。 | [外交/n 和/c 安全/an]NP-BL ，                                |
| 5    | Bg   | 区别语素 | 区别词性语素。区别词代码为b，语素代码ｇ前面置以B。     | 赤/Ag 橙/Bg 黄/a 绿/a 青/a 蓝/a 紫/a ，                      |
| 6    | b    | 区别词   | 取汉字“别”的声母。                                     | 女/b 司机/n， 金/b 手镯/n， 慢性/b 胃炎/n， 古/b 钱币/n， 副/b 主任/n， 总/b 公司/n 单音节区别词和单音节名词或名语素组合，作为一个词，并标以名词词性n。 |
| 7    | c    | 连词     | 取英语连词conjunction的第1个字母。                     | 合作/vn 与/c 伙伴/n                                          |
| 8    | Dg   | 副语素   | 副词性语素。副词代码为d，语素代码ｇ前面置以D。         | 了解/v 甚/Dg 深/a ，煞/Dg 是/v 喜人/a ，                     |
| 9    | d    | 副词     | 取adverb的第2个字母，因其第1个字母已用于形容词。       | 进一步/d 发展/v ，                                           |
| 10   | e    | 叹词     | 取英语叹词exclamation的第1个字母。                     | 啊/e ，/w 那/r 金灿灿/z 的/u 麦穗/n ，                       |
| 11   | f    | 方位词   | 取汉字“方”。                                           | 军人/n 的/u 眼睛/n 里/f 不/d 是/v 没有/v 风景/n ，           |
| 12   | h    | 前接成分 | 取英语head的第1个字母。                                | 许多/m 非/h 主角/n 人物/n ，办事处/n 的/u “/w 准/h 政府/n ”/w 功能/n 不断/d 加强/v ， |
| 13   | i    | 成语     | 取英语成语idiom的第1个字母。                           | 一言一行/i ，义无反顾/i ，                                   |
| 14   | j    | 简称略语 | 取汉字“简”的声母。                                     | [德/j 外长/n]NP ，文教/j ，                                  |
| 15   | k    | 后接成分 | 后接成分。                                             | 少年儿童/l 朋友/n 们/k ，身体/n 健康/a 者/k ，               |
| 16   | l    | 习用语   | 习用语尚未成为成语，有点“临时性”，取“临”的声母。       | 少年儿童/l 朋友/n 们/k ，落到实处/l ，                       |
| 17   | Mg   | 数语素   | 数词性语素。数词代码为m，语素代码ｇ前面置以M。         | 甲/Mg 减下/v 的/u 人/n 让/v 乙/Mg 背上/v ，凡/d “/w 寅/Mg 年/n ”/w 中/f 出生/v 的/u 人/n 生肖/n 都/d 属/v 虎/n ， |
| 18   | m    | 数词     | 取英语numeral的第3个字母，n，u已有他用。               | 1．数量词组应切分为数词和量词。 三/m 个/q， 10/m 公斤/q， 一/m 盒/q 点心/n ，但少数数量词已是词典的登录单位，则不再切分。 一个/m ， 一些/m ，2. 基数、序数、小数、分数、百分数一律不予切分，为一个切分单位，标注为 m 。一百二十三/m，20万/m， 123.54/m， 一个/m， 第一/m， 第三十五/m， 20%/m， 三分之二/m， 千分之三十/m， 几十/m 人/n， 十几万/m 元/q， 第一百零一/m 个/q ，3. 约数，前加副词、形容词或后加“来、多、左右”等助数词的应予分开。约/d 一百/m 多/m 万/m，仅/d 一百/m 个/q， 四十/m 来/m 个/q，二十/m 余/m 只/q， 十几/m 个/q，三十/m 左右/m ，两个数词相连的及“成百”、“上千”等则不予切分。五六/m 年/q， 七八/m 天/q，十七八/m 岁/q， 成百/m 学生/n，上千/m 人/n， 4．表序关系的“数＋名”结构，应予切分。二/m 连/n ，　三/m 部/n ， |
| 19   | Ng   | 名语素   | 名词性语素。名词代码为n，语素代码ｇ前面置以N。         | 出/v 过/u 两/m 天/q 差/Ng， 理/v 了/u 一/m 次/q 发/Ng，      |
| 20   | n    | 名词     | 取英语名词noun的第1个字母。                            | （参见 动词--v）岗位/n ， 城市/n ， 机会/n ，她/r 是/v 责任/n 编辑/n ， |
| 21   | nr   | 人名     | 名词代码n和“人(ren)”的声母并在一起。                   | 1. 汉族人及与汉族起名方式相同的非汉族人的姓和名单独切分，并分别标注为nr。张/nr 仁伟/nr， 欧阳/nr 修/nr， 阮/nr 志雄/nr， 朴/nr 贞爱/nr汉族人除有单姓和复姓外，还有双姓，即有的女子出嫁后，在原来的姓上加上丈夫的姓。如：陈方安生。这种情况切分、标注为：陈/nr 方/nr 安生/nr；唐姜氏，切分、标注为：唐/nr 姜氏/nr。2. 姓名后的职务、职称或称呼要分开。江/nr 主席/n， 小平/nr 同志/n， 江/nr 总书记/n，张/nr 教授/n， 王/nr 部长/n， 陈/nr 老总/n， 李/nr 大娘/n， 刘/nr 阿姨/n， 龙/nr 姑姑/n3. 对人的简称、尊称等若为两个字，则合为一个切分单位，并标以nr。老张/nr， 大李/nr， 小郝/nr， 郭老/nr， 陈总/nr4. 明显带排行的亲属称谓要切分开，分不清楚的则不切开。三/m 哥/n， 大婶/n， 大/a 女儿/n， 大哥/n， 小弟/n， 老爸/n5. 一些著名作者的或不易区分姓和名的笔名通常作为一个切分单位。鲁迅/nr， 茅盾/nr， 巴金/nr， 三毛/nr， 琼瑶/nr， 白桦/nr6. 外国人或少数民族的译名（包括日本人的姓名）不予切分，标注为nr。克林顿/nr， 叶利钦/nr， 才旦卓玛/nr， 小林多喜二/nr， 北研二/nr，华盛顿/nr， 爱因斯坦/nr有些西方人的姓名中有小圆点，也不分开。卡尔·马克思/nr |
| 22   | ns   | 地名     | 名词代码n和处所词代码s并在一起。                       | 安徽/ns，深圳/ns，杭州/ns，拉萨/ns，哈尔滨/ns， 呼和浩特/ns， 乌鲁木齐/ns，长江/ns，黄海/ns，太平洋/ns， 泰山/ns， 华山/ns，亚洲/ns， 海南岛/ns，太湖/ns，白洋淀/ns， 俄罗斯/ns，哈萨克斯坦/ns，彼得堡/ns， 伏尔加格勒/ns 1. 国名不论长短，作为一个切分单位。中国/ns， 中华人民共和国/ns， 日本国/ns， 美利坚合众国/ns， 美国/ns2. 地名后有“省”、“市”、“县”、“区”、“乡”、“镇”、“村”、“旗”、“州”、“都”、“府”、“道”等单字的行政区划名称时，不切分开，作为一个切分单位。四川省/ns， 天津市/ns，景德镇/ns沙市市/ns， 牡丹江市/ns，正定县/ns，海淀区/ns， 通州区/ns，东升乡/ns， 双桥镇/ns 南化村/ns，华盛顿州/ns，俄亥俄州/ns，东京都/ns， 大阪府/ns，北海道/ns， 长野县/ns，开封府/ns，宣城县/ns3. 地名后的行政区划有两个以上的汉字，则将地名同行政区划名称切开，不过要将地名同行政区划名称用方括号括起来，并标以短语NS。[芜湖/ns 专区/n] NS，[宣城/ns 地区/n]ns，[内蒙古/ns 自治区/n]NS，[深圳/ns 特区/n]NS， [厦门/ns 经济/n 特区/n]NS， [香港/ns 特别/a 行政区/n]NS，[香港/ns 特区/n]NS， [华盛顿/ns 特区/n]NS，4. 地名后有表示地形地貌的一个字的普通名词，如“江、河、山、洋、海、岛、峰、湖”等，不予切分。鸭绿江/ns，亚马逊河/ns， 喜马拉雅山/ns， 珠穆朗玛峰/ns，地中海/ns，大西洋/ns，洞庭湖/ns， 塞普路斯岛/ns 5. 地名后接的表示地形地貌的普通名词若有两个以上汉字，则应切开。然后将地名同该普通名词标成短语NS。[台湾/ns 海峡/n]NS，[华北/ns 平原/n]NS，[帕米尔/ns 高原/n]NS， [南沙/ns 群岛/n]NS，[京东/ns 大/a 峡谷/n]NS [横断/b 山脉/n]NS6．地名后有表示自然区划的一个字的普通名词，如“ 街，路，道，巷，里，町，庄，村，弄，堡”等，不予切分。 中关村/ns，长安街/ns，学院路/ns， 景德镇/ns， 吴家堡/ns， 庞各庄/ns， 三元里/ns，彼得堡/ns， 北菜市巷/ns， 7．地名后接的表示自然区划的普通名词若有两个以上汉字，则应切开。然后将地名同自然区划名词标成短语NS。[米市/ns 大街/n]NS， [蒋家/nz 胡同/n]NS ， [陶然亭/ns 公园/n]NS ， 8． 大小地名相连时的标注方式为：北京市/ns 海淀区/ns 海淀镇/ns [南/f 大街/n]NS [蒋家/nz 胡同/n]NS 24/m 号/q ， |
| 23   | nt   | 机构团体 | “团”的声母为t，名词代码n和t并在一起。                  | （参见2。短语标记说明--NT）联合国/nt，中共中央/nt，国务院/nt， 北京大学/nt1．大多数团体、机构、组织的专有名称一般是短语型的，较长，且含有地名或人名等专名，再组合，标注为短语NT。[中国/ns 计算机/n 学会/n]NT， [香港/ns 钟表业/n 总会/n]NT， [烟台/ns 大学/n]NT， [香港/ns 理工大学/n]NT， [华东/ns 理工大学/n]NT，[合肥/ns 师范/n 学院/n]NT， [北京/ns 图书馆/n]NT， [富士通/nz 株式会社/n]NT， [香山/ns 植物园/n]NT， [安娜/nz 美容院/n]NT，[上海/ns 手表/n 厂/n]NT， [永和/nz 烧饼铺/n]NT，[北京/ns 国安/nz 队/n]NT，2. 对于在国际或中国范围内的知名的唯一的团体、机构、组织的名称即使前面没有专名，也标为nt或NT。联合国/nt，国务院/nt，外交部/nt， 财政部/nt，教育部/nt， 国防部/nt，[世界/n 贸易/n 组织/n]NT， [国家/n 教育/vn 委员会/n]NT，[信息/n 产业/n 部/n]NT，[全国/n 信息/n 技术/n 标准化/vn 委员会/n]NT，[全国/n 总/b 工会/n]NT，[全国/n 人民/n 代表/n 大会/n]NT，美国的“国务院”，其他国家的“外交部、财政部、教育部”，必须在其所属国的国名之后出现时，才联合标注为NT。[美国/ns 国务院/n]NT，[法国/ns 外交部/n]NT，[美/j 国会/n]NT，日本有些政府机构名称很特别，无论是否出现在“日本”国名之后都标为nt。[日本/ns 外务省/nt]NT，[日/j 通产省/nt]NT通产省/nt 3. 前后相连有上下位关系的团体机构组织名称的处理方式如下:[联合国/nt 教科文/j 组织/n]NT， [中国/ns 银行/n 北京/ns 分行/n]NT，[河北省/ns 正定县/ns 西平乐乡/ns 南化村/ns 党支部/n]NT， 当下位名称含有专名（如“北京/ns 分行/n”、“南化村/ns 党支部/n”、“昌平/ns 分校/n”）时，也可脱离前面的上位名称单独标注为NT。[中国/ns 银行/n]NT [北京/ns 分行/n]NT，北京大学/nt [昌平/ns 分校/n]NT，4. 团体、机构、组织名称中用圆括号加注简称时:[宝山/ns 钢铁/n （/w 宝钢/j ）/w 总/b 公司/n]NT，[宝山/ns 钢铁/n 总/b 公司/n]NT，（/w 宝钢/j ）/w |
| 24   | nx   | 外文字符 | 外文字符。                                             | A/nx 公司/n ，B/nx 先生/n ，X/nx 君/Ng ，24/m K/nx 镀金/n ，C/nx 是/v 光速/n ，Windows98/nx ，PentiumIV/nx ，I LOVE THIS GAME/nx ，HanLP/nx |
| 25   | nz   | 其他专名 | “专”的声母的第1个字母为z，名词代码n和z并在一起。       | （参见2。短语标记说明--NZ）除人名、国名、地名、团体、机构、组织以外的其他专有名词都标以nz。满族/nz，俄罗斯族/nz，汉语/nz，罗马利亚语/nz， 捷克语/nz，中文/nz， 英文/nz， 满人/nz， 哈萨克人/nz， 诺贝尔奖/nz， 茅盾奖/nz， 1.包含专有名称（或简称）的交通线，标以nz；短语型的，标为NZ。津浦路/nz， 石太线/nz， [京/j 九/j 铁路/n]NZ， [京/j 津/j 高速/b 公路/n]NZ， 2. 历史上重要事件、运动等专有名称一般是短语型的，按短语型专有名称处理，标以NZ。[卢沟桥/ns 事件/n]NZ， [西安/ns 事变/n]NZ，[五四/t 运动/n]NZ， [明治/nz 维新/n]NZ，[甲午/t 战争/n]NZ，3.专有名称后接多音节的名词，如“语言”、“文学”、“文化”、“方式”、“精神”等，失去专指性，则应分开。欧洲/ns 语言/n， 法国/ns 文学/n， 西方/ns 文化/n， 贝多芬/nr 交响乐/n， 雷锋/nr 精神/n， 美国/ns 方式/n，日本/ns 料理/n， 宋朝/t 古董/n 4. 商标（包括专名及后接的“牌”、“型”等）是专指的，标以nz，但其后所接的商品仍标以普通名词n。康师傅/nr 方便面/n， 中华牌/nz 香烟/n， 牡丹III型/nz 电视机/n， 联想/nz 电脑/n， 鳄鱼/nz 衬衣/n， 耐克/nz 鞋/n5. 以序号命名的名称一般不认为是专有名称。2/m 号/q 国道/n ，十一/m 届/q 三中全会/j如果前面有专名，合起来作为短语型专名。[中国/ns 101/m 国道/n]NZ， [中共/j 十一/m 届/q 三中全会/j]NZ，6. 书、报、杂志、文档、报告、协议、合同等的名称通常有书名号加以标识，不作为专有名词。由于这些名字往往较长，名字本身按常规处理。《/w 宁波/ns 日报/n 》/w ，《/w 鲁迅/nr 全集/n 》/w，中华/nz 读书/vn 报/n， 杜甫/nr 诗选/n，少数书名、报刊名等专有名称，则不切分。红楼梦/nz， 人民日报/nz，儒林外史/nz 7. 当有些专名无法分辨它们是人名还是地名或机构名时，暂标以nz。[巴黎/ns 贝尔希/nz 体育馆/n]NT，其中“贝尔希”只好暂标为nz。 |
| 26   | o    | 拟声词   | 取英语拟声词onomatopoeia的第1个字母。                  | 哈哈/o 一/m 笑/v ，装载机/n 隆隆/o 推进/v ，                 |
| 27   | p    | 介词     | 取英语介词prepositional的第1个字母。                   | 对/p 子孙后代/n 负责/v ，以/p 煤/n 养/v 农/Ng ，为/p 治理/v 荒山/n 服务/v ， 把/p 青年/n 推/v 上/v 了/u 领导/vn 岗位/n ， |
| 28   | q    | 量词     | 取英语quantity的第1个字母。                            | （参见数词m）首/m 批/q ，一/m 年/q ，                        |
| 29   | Rg   | 代语素   | 代词性语素。代词代码为r，在语素的代码g前面置以R。      | 读者/n 就/d 是/v 这/r 两/m 棵/q 小树/n 扎根/v 于/p 斯/Rg 、/w 成长/v 于/p 斯/Rg 的/u 肥田/n 沃土/n ， |
| 30   | r    | 代词     | 取英语代词pronoun的第2个字母，因p已用于介词。          | 单音节代词“本”、“每”、“各”、“诸”后接单音节名词时，和后接的单音节名词合为代词；当后接双音节名词时，应予切分。本报/r， 每人/r， 本社/r， 本/r 地区/n， 各/r 部门/n |
| 31   | s    | 处所词   | 取英语space的第1个字母。                               | 家里/s 的/u 电脑/n 都/d 联通/v 了/u 国际/n 互联网/n ，西部/s 交通/n 咽喉/n ， |
| 32   | Tg   | 时语素   | 时间词性语素。时间词代码为t，在语素的代码g前面置以T。  | ３日/t 晚/Tg 在/p 总统府/n 发表/v 声明/n ，尊重/v 现/Tg 执政/vn 当局/n 的/u 权威/n ， |
| 33   | t    | 时间词   | 取英语time的第1个字母。                                | 1. 年月日时分秒，按年、月、日、时、分、秒切分，标注为t 。1997年/t 3月/t 19日/t 下午/t 2时/t 18分/t若数字后无表示时间的“年、月、日、时、分、秒”等的标为数词m。1998/m 中文/n 信息/n 处理/vn 国际/n 会议/n 2. 历史朝代的名称虽然有专有名词的性质，仍标注为t。西周/t， 秦朝/t， 东汉/t， 南北朝/t， 清代/t“牛年、虎年”等一律不予切分，标注为：牛年/t， 虎年/t， 甲午年/t， 甲午/t 战争/n， 庚子/t 赔款/n， 戊戌/t 变法/n |
| 34   | u    | 助词     | 取英语助词auxiliary。                                  | [[俄罗斯/ns 和/c 北约/j]NP-BL 之间/f [战略/n 伙伴/n 关系/n]NP 的/u 建立/vn]NP 填平/v 了/u [[欧洲/ns 安全/a 政治/n]NP 的/u 鸿沟/n]NP |
| 35   | Vg   | 动语素   | 动词性语素。动词代码为v。在语素的代码g前面置以V。      | 洗/v 了/u 一个/m 舒舒服服/z 的/u 澡/Vg                       |
| 36   | v    | 动词     | 取英语动词verb的第一个字母。                           | （参见 名词--n）[[[欧盟/j 扩大/v]S 的/u [历史性/n 决定/n]NP]NP 和/c [北约/j 开放/v]S]NP-BL [为/p [创建/v [一/m 种/q 新/a 的/u 欧洲/ns 安全/a 格局/n]NP]VP-SBI]PP-MD [奠定/v 了/u 基础/n]V-SBI ，， |
| 37   | vd   | 副动词   | 直接作状语的动词。动词和副词的代码并在一起。           | 形势/n 会/v 持续/vd 好转/v ，认为/v 是/v 电话局/n 收/v 错/vd 了/u 费/n ， |
| 38   | vn   | 名动词   | 指具有名词功能的动词。动词和名词的代码并在一起。       | 引起/v 人们/n 的/u 关注/vn 和/c 思考/vn ，收费/vn 电话/n 的/u 号码/n ， |
| 39   | w    | 标点符号 |                                                        | ”/w ：/w                                                     |
| 40   | x    | 非语素字 | 非语素字只是一个符号，字母x通常用于代表未知数、符号。  |                                                              |
| 41   | Yg   | 语气语素 | 语气词性语素。语气词代码为y。在语素的代码g前面置以Y。  | 唯/d 大力/d 者/k 能/v 致/v 之/u 耳/Yg                        |
| 42   | y    | 语气词   | 取汉字“语”的声母。                                     | 会/v 泄露/v 用户/n 隐私/n 吗/y ，又/d 何在/v 呢/y ？         |
| 43   | z    | 状态词   | 取汉字“状”的声母的前一个字母。                         | 取得/v 扎扎实实/z 的/u 突破性/n 进展/vn ，四季/n 常青/z 的/u 热带/n 树木/n ，短短/z 几/m 年/q 间， |

================================================
FILE: docs/annotations/pos/ud.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# Universal Dependencies

See also [Universal Dependencies](https://universaldependencies.org/u/pos/).

| Tag        | Description                                  |
|------------|----------------------------------------------|
| ADJ   | adjective                 |
| ADP   | adposition                |
| ADV   | adverb                    |
| AUX   | auxiliary                 |
| CCONJ | coordinating conjunction  |
| DET   | determiner                |
| INTJ  | interjection              |
| NOUN  | noun                      |
| NUM   | numeral                   |
| PART  | particle                  |
| PRON  | pronoun                   |
| PROPN | proper noun               |
| PUNCT | punctuation               |
| SCONJ | subordinating conjunction |
| SYM   | symbol                    |
| VERB  | verb                      |
| X     | other                     |

================================================
FILE: docs/annotations/sdp/dm.md
================================================
# The reduction of Minimal Recursion Semantics

Please refer to [Minimal Recursion Semantics: An Introduction](https://www.cl.cam.ac.uk/~aac10/papers/mrs.pdf).


================================================
FILE: docs/annotations/sdp/index.md
================================================
# Semantic Dependency Parsing

## Chinese

```{toctree}
semeval16
```

## English

```{toctree}
dm
pas
psd
```


================================================
FILE: docs/annotations/sdp/pas.md
================================================
# Predicate-Argument Structures

Please refer to [Probabilistic disambiguation models for wide-coverage HPSG parsing](https://www.aclweb.org/anthology/P05-1011.pdf).


================================================
FILE: docs/annotations/sdp/psd.md
================================================
# Prague Czech-English Dependency Treebank

Please refer to [Prague Czech-English Dependency Treebank](http://ufal.mff.cuni.cz/pcedt2.0/en/index.html).


================================================
FILE: docs/annotations/sdp/semeval16.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# SemEval2016

## CSDP

SemEval2016 adopts the CSDP guideline listed as follows.

### 语义关系标注标签集

| 分类         |              |                 |                                                              |
| ------------ | ------------ | --------------- | ------------------------------------------------------------ |
| 语义周边角色 | 主体角色     | 施事AGT；       | 施事Agt；感事Aft                                             |
|              |              | 当事EXP；       | 当事Exp；领事Poss                                            |
|              | 客体角色     | 受事PAT；       | 受事Pat                                                      |
|              |              | 客事CONT；      | 客事Cont；成事Prod；结局Cons                                 |
|              |              | 涉事DATV；      | 涉事Datv；比较Comp；源事Orig                                 |
|              |              | 系事LINK；      | 类事Clas；属事Belg                                           |
|              | 情境角色     | 工具TOOL；      | 工具Tool                                                     |
|              |              | 材料MATL；      | 材料Matl                                                     |
|              |              | 方式MANN；      | 方式Mann；依据Accd                                           |
|              |              | 范围SCO；       | 范围Sco                                                      |
|              |              | 缘由REAS；      | 缘故Reas；意图Int                                            |
|              |              | 时间TIME；      | 时间Time；时间起点Tini；时间终点Tfin；时段Tdur；时距Trang    |
|              |              | 空间LOC；       | 空间Loc；原处所Lini；终处所Lfin；通过处所Lthru；趋向Dir      |
|              |              | 度量MEAS；      | 数量Quan；起始量Nini；终止量Nfin；数量短语Qp；频率Freq；顺序Seq；变化量Nvar |
|              |              | 状态STAT；      | 状态Stat；起始状态Sini；终止状态Sfin；历经状态Sproc          |
|              |              | 修饰FEAT；      | 描写Desc；宿主Host；名词修饰语Nmod；时间修饰语Tmod           |
| 语义结构关系 | 反关系       | 反施事rAGT；    | 反施事rAgt；反感事rAft                                       |
|              |              | 反当事rEXP。    | 反当事rExp；反领事rPoss                                      |
|              |              | 反受事rPAT；    | 反受事rPat                                                   |
|              |              | 反客事rCONT；   | 反客事rCont；反成事rProd；反结局rCons                        |
|              |              | 反涉事rDATV；   | 反涉事rDatv；反比较rComp；反源事rOrig                        |
|              |              | 反系事rLINK。   | 反类事rClas；反属事rBelg                                     |
|              |              | 反工具rTOOL；   | 反工具rTool                                                  |
|              |              | 反材料rMATL；   | 反材料rMatl                                                  |
|              |              | 反方式rMANN；   | 反方式rMann；反依据rAccd                                     |
|              |              | 反范围rSCO；    | 反范围rSco                                                   |
|              |              | 反缘由rREAS；   | 反缘故rReas；反意图rInt                                      |
|              |              | 反时间rTIME；   | 反时间rTime；反时间起点rTini；反时间终点rTfin；反时段rTdur；反时距rTrang |
|              |              | 反空间rLOC；    | 反空间rLoc；反原处所rLini；反终处所rLfin；反通过处所rLthru；反趋向rDir |
|              |              | 反度量rMEAS；   | 反数量rQuan；反起始量rNini；反终止量rNfin；反数量短语rQp；反频率rFreq；反顺序rSeq；反变化量rNvar |
|              |              | 反状态rSTAT；   | 反状态rStat；反起始状态rSini；反终止状态rSfin；反历经状态rSproc |
|              |              | 反修饰rFEAT；   | 反描写rDesc；反宿主rHost; 反名词修饰语rNmod; 反时间修饰语rTmod |
|              | 嵌套事件关系 | 嵌套施事dAGT；  | 嵌套施事dAgt；嵌套感事dAft                                   |
|              |              | 嵌套当事dEXP。  | 嵌套当事dExp；嵌套领事dPoss                                  |
|              |              | 嵌套受事dPAT；  | 嵌套受事dPat                                                 |
|              |              | 嵌套客事dCONT； | 嵌套客事dCont；嵌套成事dProd；嵌套结局dCons                  |
|              |              | 嵌套涉事dDATV； | 嵌套涉事dDatv；嵌套比较dComp；嵌套源事dOrig                  |
|              |              | 嵌套系事dLINK。 | 嵌套类事dClas；嵌套属事dBelg                                 |
|              |              | 嵌套工具dTOOL； | 嵌套工具dTool                                                |
|              |              | 嵌套材料dMATL； | 嵌套材料dMatl                                                |
|              |              | 嵌套方式dMANN； | 嵌套方式dMann；嵌套依据dAccd                                 |
|              |              | 嵌套范围dSCO；  | 嵌套范围dSco                                                 |
|              |              | 嵌套缘由dREAS； | 嵌套缘故dReas；嵌套意图dInt                                  |
|              |              | 嵌套时间dTIME； | 嵌套时间dTime；嵌套时间起点dTini；嵌套时间终点dTfin；嵌套时段dTdur；嵌套时距dTrang |
|              |              | 嵌套空间dLOC；  | 嵌套空间dLoc；嵌套原处所dLini；嵌套终处所dLfin；嵌套通过处所dLthru；嵌套趋向dDir |
|              |              | 嵌套度量dMEAS； | 嵌套数量dQuan；嵌套起始量dNini；嵌套终止量dNfin；嵌套数量短语dQp；嵌套频率dFreq；嵌套顺序dSeq；嵌套变化量dNvar |
|              |              | 嵌套状态dSTAT； | 嵌套状态dStat；嵌套起始状态dSini；嵌套终止状态dSfin；嵌套历经状态dSproc |
|              |              | 嵌套修饰dFEAT； | 嵌套描写dDesc；嵌套宿主dHost; 嵌套名词修饰语dNmod; 嵌套时间修饰语dTmod |
|              | 事件关系     | 并列关系eCOO；  | 并列eCoo；等同eEqu；分叙eRect；选择eSelt;割舍eAban；选取ePref；总括eSum |
|              |              | 先行关系ePREC； | 先行ePrec；原因eCau；条件eCond；假设eSupp；手段eMetd；让步eConc |
|              |              | 后继关系eSUCC； | 后继eSucc；递进eProg；转折 eAdvt；目的ePurp；结果eResu；推论eInf |
| 语义依附标记 | 标点标记     | 标点标记mPUNC； | 标点标记mPunc                                                |
|              | 依附标记     | 否定标记mNEG；  | 否定标记mNeg                                                 |
|              |              | 关系标记mRELA； | 连词标记mConj；介词标记mPrep                                 |
|              |              | 依附标记mDEPD； | 语气标记mTone；时间标记mTime；范围标记mRang；情态标记mMod； 频率标记mFreq；程度标记mDegr；趋向标记mDir；的字标记mAux； 多数标记mMaj；插入语标记mPars；离合标记mSepa；实词虚化标记mVain；重复标记mRept |

## SemEval2016

The following table is a subset of CSDP but offers some examples to illustrate the idea.

| 关系类型   | Tag           | Description        | Example                     |
|--------|---------------|--------------------|-----------------------------|
| 施事关系   | Agt           | Agent              | 我送她一束花 (我 <– 送)             |
| 当事关系   | Exp           | Experiencer        | 我跑得快 (跑 –> 我)               |
| 感事关系   | Aft           | Affection          | 我思念家乡 (思念 –> 我)             |
| 领事关系   | Poss          | Possessor          | 他有一本好书 (他 <– 有)             |
| 受事关系   | Pat           | Patient            | 他打了小明 (打 –> 小明)             |
| 客事关系   | Cont          | Content            | 他听到鞭炮声 (听 –> 鞭炮声)           |
| 成事关系   | Prod          | Product            | 他写了本小说 (写 –> 小说)            |
| 源事关系   | Orig          | Origin             | 我军缴获敌人四辆坦克 (缴获 –> 坦克)       |
| 涉事关系   | Datv          | Dative             | 他告诉我个秘密 ( 告诉 –> 我 )         |
| 比较角色   | Comp          | Comparison         | 他成绩比我好 (他 –> 我)             |
| 属事角色   | Belg          | Belongings         | 老赵有俩女儿 (老赵 <– 有)            |
| 类事角色   | Clas          | Classification     | 他是中学生 (是 –> 中学生)            |
| 依据角色   | Accd          | According          | 本庭依法宣判 (依法 <– 宣判)           |
| 缘故角色   | Reas          | Reason             | 他在愁女儿婚事 (愁 –> 婚事)           |
| 意图角色   | Int           | Intention          | 为了金牌他拼命努力 (金牌 <– 努力)        |
| 结局角色   | Cons          | Consequence        | 他跑了满头大汗 (跑 –> 满头大汗)         |
| 方式角色   | Mann          | Manner             | 球慢慢滚进空门 (慢慢 <– 滚)           |
| 工具角色   | Tool          | Tool               | 她用砂锅熬粥 (砂锅 <– 熬粥)           |
| 材料角色   | Matl          | Material           | 她用小米熬粥 (小米 <– 熬粥)           |
| 时间角色   | Time          | Time               | 唐朝有个李白 (唐朝 <– 有)            |
| 空间角色   | Loc           | Location           | 这房子朝南 (朝 –> 南)              |
| 历程角色   | Proc          | Process            | 火车正在过长江大桥 (过 –> 大桥)         |
| 趋向角色   | Dir           | Direction          | 部队奔向南方 (奔 –> 南)             |
| 范围角色   | Sco           | Scope              | 产品应该比质量 (比 –> 质量)           |
| 数量角色   | Quan          | Quantity           | 一年有365天 (有 –> 天)            |
| 数量短语   | Qp            | Quantity-phrase    | 三本书 (三 –> 本)                |
| 频率角色   | Freq          | Frequency          | 他每天看书 (每天 <– 看)             |
| 顺序角色   | Seq           | Sequence           | 他跑第一 (跑 –> 第一)              |
| 描写角色   | Desc(Feat)    | Description        | 他长得胖 (长 –> 胖)               |
| 宿主角色   | Host          | Host               | 住房面积 (住房 <– 面积)             |
| 名字修饰角色 | Nmod          | Name-modifier      | 果戈里大街 (果戈里 <– 大街)           |
| 时间修饰角色 | Tmod          | Time-modifier      | 星期一上午 (星期一 <– 上午)           |
| 反角色    | r + main role |                    | 打篮球的小姑娘 (打篮球 <– 姑娘)         |
| 嵌套角色   | d + main role |                    | 爷爷看见孙子在跑 (看见 –> 跑)          |
| 并列关系   | eCoo          | event Coordination | 我喜欢唱歌和跳舞 (唱歌 –> 跳舞)         |
| 选择关系   | eSelt         | event Selection    | 您是喝茶还是喝咖啡 (茶 –> 咖啡)         |
| 等同关系   | eEqu          | event Equivalent   | 他们三个人一起走 (他们 –> 三个人)        |
| 先行关系   | ePrec         | event Precedent    | 首先，先                        |
| 顺承关系   | eSucc         | event Successor    | 随后，然后                       |
| 递进关系   | eProg         | event Progression  | 况且，并且                       |
| 转折关系   | eAdvt         | event adversative  | 却，然而                        |
| 原因关系   | eCau          | event Cause        | 因为，既然                       |
| 结果关系   | eResu         | event Result       | 因此，以致                       |
| 推论关系   | eInf          | event Inference    | 才，则                         |
| 条件关系   | eCond         | event Condition    | 只要，除非                       |
| 假设关系   | eSupp         | event Supposition  | 如果，要是                       |
| 让步关系   | eConc         | event Concession   | 纵使，哪怕                       |
| 手段关系   | eMetd         | event Method       |                             |
| 目的关系   | ePurp         | event Purpose      | 为了，以便                       |
| 割舍关系   | eAban         | event Abandonment  | 与其，也不                       |
| 选取关系   | ePref         | event Preference   | 不如，宁愿                       |
| 总括关系   | eSum          | event Summary      | 总而言之                        |
| 分叙关系   | eRect         | event Recount      | 例如，比方说                      |
| 连词标记   | mConj         | Conjunction        | 和，或                         |
| 的字标记   | mAux          | Auxiliary          | 的，地，得                       |
| 介词标记   | mPrep         | Preposition        | 把，被                         |
| 语气标记   | mTone         | Tone               | 吗，呢                         |
| 时间标记   | mTime         | Time               | 才，曾经                        |
| 范围标记   | mRang         | Range              | 都，到处                        |
| 程度标记   | mDegr         | Degree             | 很，稍微                        |
| 频率标记   | mFreq         | Frequency Marker   | 再，常常                        |
| 趋向标记   | mDir          | Direction Marker   | 上去，下来                       |
| 插入语标记  | mPars         | Parenthesis Marker | 总的来说，众所周知                   |
| 否定标记   | mNeg          | Negation Marker    | 不，没，未                       |
| 情态标记   | mMod          | Modal Marker       | 幸亏，会，能                      |
| 标点标记   | mPunc         | Punctuation Marker | ，。！                         |
| 重复标记   | mRept         | Repetition Marker  | 走啊走 (走 –> 走)                |
| 多数标记   | mMaj          | Majority Marker    | 们，等                         |
| 实词虚化标记 | mVain         | Vain Marker        |                             |
| 离合标记   | mSepa         | Separation Marker  | 吃了个饭 (吃 –> 饭) 洗了个澡 (洗 –> 澡) |
| 根节点    | Root          | Root               | 全句核心节点                      |

See also [SemEval-2016 Task 9](https://www.hankcs.com/nlp/sdp-corpus.html) and [CSDP](https://csdp-doc.readthedocs.io/zh_CN/latest/%E9%99%84%E5%BD%95/).


================================================
FILE: docs/annotations/srl/cpb.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# Chinese Proposition Bank

|      | 标签       | 角色    | 例子                      |
|------|----------|-------|-------------------------|
| 中心角色 | ARG0     | 施事者   | (ARG0中国政府)提供援助         |
|      | ARG1     | 受事者   | 中国政府提供(ARG1援助)          |
|      | ARG2     | 依谓词而定 | 失业率控制(ARG2在百分之十内)       |
|      | ARG3     | 依谓词而定 | (ARG3从城市)扩大到农村          |
|      | ARG4     | 依谓词而定 | 提高(ARG4百分之二十)          |
| 附属角色 | ARGM-ADV | 状语    | (ARGM-ADV共同)承担          |
|      | ARGM-BNF | 受益者   | (ARGM-BNF为其他国家)进行融资     |
|      | ARGM-CND | 条件    | (ARGM-CND如果成功)，他就留下     |
|      | ARGM-DIR | 方向    | (ARGM-DIR向和平)迈出一大步      |
|      | ARGM-EXT | 范围    | 在北京逗留(ARGM-EXT两天)      |
|      | ARGM-FRQ | 频率    | 每半年执行(ARGM-FRQ一次)      |
|      | ARGM-LOC | 地点、位置 | (ARGM-LOC在机场)被捕获        |
|      | ARGM-MNR | 方式    | (ARGM-MNR以中英文)发行        |
|      | ARGM-PRP | 目的或原因 | (ARGM-PRP由于危机)而破产       |
|      | ARGM-TMP | 时间    | 公司(ARGM-TMP去年)成立       |
|      | ARGM-TPC | 主题    | (ARGM-TPC稳定政策)，核心是...   |
|      | ARGM-DIS | 话语标记  | (ARGM-DIS因此)，他感到不公      |
|      | ARGM-CRD | 并列论元  | (ARGM-CRD与台湾)非正式接触      |
|      | ARGM-PRD | 次谓词   | 指控廉政公署五人(ARGM-PRD接受贿赂) |


```{note}
Although ARG0 and ARG1 share general definitions across all predicates, word sense disambiguation is required to find 
the corresponding definition of semantic roles. Given the word sense of `变化`, say `变化-2`, 
[its second frameset](http://verbs.colorado.edu/chinese/cpb/html_frames/0183-bian-hua.html) can 
be found which defines the following 2 arguments:

1.    ARG0: agent/cause
2.    ARG1: entity arg0 changes

These definitions are different from that of frameset `变化-1`:

1.    ARG0: entity undergoing change
   
Sometimes, the number of arguments and definitions can vary a lot across framesets. 
In summary, word sense disambiguation is essential if SRL is to be used to best effect in practical applications.
```

================================================
FILE: docs/annotations/srl/index.md
================================================
# Semantic Role Labeling

## Chinese
```{toctree}
cpb
```

## English
```{toctree}
propbank
```


================================================
FILE: docs/annotations/srl/propbank.md
================================================
<!--
# ========================================================================
# Copyright 2020 hankcs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ========================================================================
-->

# English PropBank

| Role | Description                                  |
|------|----------------------------------------|
| ARG0 | agent                                  |
| ARG1 | patient                                |
| ARG2 | instrument, benefactive, attribute     |
| ARG3 | starting point, benefactive, attribute |
| ARG4 | ending point                           |
| ARGM | modifier                               |
| COM  | Comitative                             |
| LOC  | Locative                               |
| DIR  | Directional                            |
| GOL  | Goal                                   |
| MNR  | Manner                                 |
| TMP  | Temporal                               |
| EXT  | Extent                                 |
| REC  | Reciprocals                            |
| PRD  | Secondary Predication                  |
| PRP  | Purpose                                |
| CAU  | Cause                                  |
| DIS  | Discourse                              |
| ADV  | Adverbials                             |
| ADJ  | Adjectival                             |
| MOD  | Modal                                  |
| NEG  | Negation                               |
| DSP  | Direct Speech                          |
| LVB  | Light Verb                             |
| CXN  | Construction                           |


================================================
FILE: docs/annotations/tok/ctb.md
================================================
The Segmentation Guidelines for the Penn Chinese Treebank (3.0)
===============================================================

Fei Xia

*University of Pennsylvania*

This is an OCR version. See also the [PDF version](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1038&context=ircs_reports).

## Abstract


This document describes the segmentation guidelines for the Penn Chinese Treebank Project. The goal of the project is the creation of a 100-thousand-word corpus of Mandarin Chinese text with syntactic bracketing. The Chinese Treebank has been released via the Linguistic Data Consortium (LDC) and is available to the public.

The segmentation guidelines have been revised several times during the two-year period of the project. The previous two versions were completed in December 1998 and March 1999, respectively. This document is the third and final version. We have added an introduction chapter in order to explain some rationale behind certain decisions in the guidelines. We also include the English gloss to the Chinese words in the guidelines.


In this document, we first discuss the notion of word and tests for wordhood that have been proposed in the literature. Then we give the specification for word segmentation. The specification is organized according to the potential Part-of-Speech tag of an expression and the internal structure of the expression. Next, we specify the treatment for some common collocations. Finally, we compare our guidelines with two segmentation standards: the first (Liu et al., 1993) is used in Mainland China and the second (CKIP, 1996) is used in Academia Sinica in Taiwan.

## Chapter 1 Introduction

This document is designed for the Penn Chinese Treebank Project [XPX+ 00]. The goal of the project is the creation of a 100-thousand word corpus of Mandarin Chinese text with syntactic bracketing. The annotation consists of two stages: the first phase is word segmentation and part-of-speech (POS) tagging and the second phase is syntactic bracketing. Each stage includes at least two passes, that is, the data are annotated by one annotator, then the resulting files are checked by another annotator.

The segmentation guidelines, like POS guidelines and bracketing guidelines, have been revised several times during the project. So far, we have released all three versions on our web site: the first draft was completed in December 1998, after the first pass of word segmentation and POS tagging; the second draft in March 1999, after the second pass of word segmentation and POS tagging. This document, which is the third draft, is revised after the second pass of bracketing. The major changes in the third draft, compared with the previous two drafts, are (1) we add an introduction chapter in order to explain some rationale behind the guideline, (2) we add the gloss to the Chinese words in the guidelines,1 and (3) we also turn the guidelines into a technical report, which is published by the Institute for Research in Cognitive Science (IRCS) of the University of Pennsylvania.

### 1.1 Notion of *word*

The difficulty in defining the notion of word is not unique to Chinese,2 but the problem is certainly more severe for Chinese for a number of reasons. First, Chinese is not written with word delimiters so segmenting a sentence into "words" is not a natural task even for a native speaker. Second, Chinese has little inflectional morphology to ease word identification. Third, there is little consensus in the community on difficult constructions that could affect word segmentation. For instance, the segmentation of verb resultative compounds depends on the syntactic analysis of the construction. One view on how a verb resultative compound is formed says that a simple sentence with a compound is actually bi-clausal and the compound is formed by movement, therefore, the compound should be treated as two words. Another view believes that the compound is formed in the lexicon, and therefore should be one word. The segmentation of the verb resultative compounds depends on which view we adopt for this construction. Fourth, many monosyllabic morphemes that used to be able to stand alone in non-Modern Chinese become bound in Modern Chinese. The influence of non-Modern Chinese makes it difficult to draw the line between bound morphemes and free morphemes, the notions which could otherwise have been very useful for deciding word boundaries.


Our approach is based on both linguistic and engineering consideration. The notion word in our Treebank is roughly a syntactic atom as defined in [SW87], that is, anything that can be inserted into an X° position in syntax. This includes both compounds and simple words.

### 1.2 Tests of wordhood


What tests can be used to decide whether a string of hanzi[Chinese character] is a word or not? Without loss of generalization, we assume the string that we are trying to segment is X-Y, which has two morphemes X and Y. The following tests for establishing word boundaries have been proposed by various authors:


- Bound morpheme: a bound morpheme should be attached to its neighboring morpheme to form a word when possible.


- Productivity: if a rule that combines the expression X-Y does not apply generally (i.e., it is not productive), then X-Y is likely to be a word.


- Frequency of co-occurrence: if the expression X-Y occurs very often, it is likely to be a word.


- Complex internal structure: strings with complex internal structures should be segmented when possible.


- Compositionality: if the meaning of X-Y is not compositional, it is likely to be a word.


- Insertion: if another morpheme can be inserted between X and Y, then X-Y is unlikely to be a word.


- XP-substitution: if a morpheme can not be replaced by a phrase of the same type, then it is likely to be part of a word.


- The number of syllables: several guidelines [LTS93, Chi96] have used syllable numbers on certain cases. For example, in [LTS93], a verb resultative compound is treated as one word if the resultative part is monosyllabic, and it is treated as two words if the resultative part has more than one syllable.


All of these tests are very useful. However, none of them is sufficient by itself for covering the entire range of difficult cases. Either the test is applicable only to limited cases (e.g., the XP-substitution test) or there is no objective way to perform the test as the test refers to vaguely defined properties (e.g., in the productive test, it is not clear where to draw the line between a productive rule and a non-productive rule). For more discussion on this topic from the linguistics point of view, please refer to [Pac98, SW87].


Since no single test is sufficient, we chose a set of tests for our segmentation guidelines which includes all of the ones mentioned except for the productivity test and the frequency test. Rather than have the annotators try to memorize the entire set and make each decision from these principles, in the guidelines we spell out what the results of applying the tests would be for all of the relevant phenomena. For example, for the treatment of verb resultative compounds, we select the relevant tests (e.g., the number of syllables and the insertion test), and give several examples of the results of applying these tests to verb resultative compounds. This makes it straightforward, and thus efficient, for the annotators to follow the guidelines.

### 1.3 Compatibility with other guidelines


We have studied other groups' guidelines, such as the Segmentation Standard in China [LTS93] and the one in Taiwan [Chi96], and tried to accommodate them in our guidelines if possible.


Since the final result of the Treebank is a list of bracketed sentences, our guidelines have some flexibility with regards to the segmentation of certain constructions. For example, the string 走上来[walk up] is treated as two segments in [LTS93], but one segment in [Chi96]. In our Treebank, we will segment it into two parts, and then group them together as a compound ——that is, (走[walk]/V 上来[up]/V)/V. We call 走上来 a word with internal structures. Our annotation, in this case, is compatible with both [LTS93] and [Chi96]. The comparisons of these three guidelines can be found in Appendix A.


Note: For the sake of annotation efficiency, the grouping of the words with internal structure is done at bracketing stage, rather than at the segmentation stage. In this document, we show the grouping format, but keep in mind that the format is the one AFTER the bracketing is completed. For example, we consider 走上来[walk up] as one word. It is segmented into “走[walk]/V 上来[up]/V” at the segmentation stage, and it will be grouped into (走[walk]/V 上来[up]/V)/V at the bracketing stage. In the paper, we just say 走上来[walk up] should be annotated as (走[walk]/V 上来[up]/V)/V.


Most disagreements among these three guidelines do not make much difference to parsing or sentence interpretation. For most patterns for which the guidelines give different treatments (e.g., numbers and reduplication strings), simple conversion programs can be written to convert the data from one format to another.


Our goal is: in the final output, the word boundary (the highest-level X° in the parse tree) should be as accurate as possible, while the internal structure serves as a bridge for the resource sharing with other systems.

### 1.4 Treatment for unclear cases

There are two types of unclear cases:

- A construction is easy to identify but there is no consensus on its treatment. 
  Ex: A-not-A, V-de construction, V-R, potential form (i.e., V-de-R). Our approach: we will choose one analysis, and annotate the data according to that analysis. Make sure that the annotation is easy to convert to the structures for other analyses if necessary.
- Two constructions are difficult to tell apart by existing tests.
  Ex: some N+N are compounds, others are phrases.


Our approach: for the sake of consistency and efficiency, we don't disambiguate the two constructions unless making the distinction is crucial for various reasons.

### 1.5 Organization of this guidelines


The guidelines are organized according to the internal structure of the corresponding expressions (e.g., a verb resultative compound is represented as V+V, while a verb-object expression is as V+N), so it is easy for the annotators to search the guidelines for reference. The Part-of-speech tags used in this paper are identical to the ones used in the POS tagging task except that the tags for verbs are merged into V and the ones for nouns are merged into N. For the descriptions of the complete POS tagset, please refer to our Part-of-Speech Tagging Guidelines for the Penn Chinese Treebank (3.0). The list of POS tags can be found in Appendix B.


In this guidelines, we list mainly the decision for each case without going into detail elaborating other alternatives and the reasoning behind each decision.

Chapter 2 Specification
---------


In this chapter, we assume that a sentence has been segmented into large chunks, and the next step is to decide whether each chunk should be further divided. The chapter is arranged by the potential POS of the chunk if the chunk is a word. To search through the section, first use the “POS” of the chunk to find the subsection, then use the “word” formation information to find the subsection; or simply use the “word” formation information.

### 2.1    Common noun: NN

#### 2.1.1    Name of relative


Treat it as one word.


Ex:三叔[uncle]/NN，表叔[uncle]/NN,大姑父[uncle]/NN.

#### 2.1.2 CD+N


If a measure word can be inserted between CD and N without changing the meaning, tag it as CD+N; otherwise, tag it as one word (N).


One word:三排[the third platoon]/NN，一方[one side]/NN,三者[three entities]/NN, 一行[a group traveling together]/NN，21世纪[the 21st century]/NT.


Two words: 一[one]/CD 学生[student]/NN.

#### 2.1.3 DT+N


Treat it as one word if both DT and N are monosyllabic and either DT or N is bound; otherwise, treat it as two words.


Sometimes, it is difficult to decide whether a morpheme is bound or not because of the influence of non-Modern Chinese. To be consistent, we maintain a list of nouns and a list of determiners. If a morpheme is in one of the lists, we consider it as bound:


- monosyllabic bound nouns: 校[school], 球 (when it means the earth).


- monosyllabic bound determiners:当[this/that]


We also treat 本人[oneself]/NN as one word and tag it as NN.


One word:本人[oneself]/NN,本校[our school]/NN,全球[whole world]/NN,当地[the place mentioned]/NN,当今[present time]/NT,当代[the contemporary era]/NN.


Two words:本[one’s]/DT 单位[organization]/NN.

#### 2.1.4    PN+N


Treat it as one word if both PN and N are monosyllabic and N is bound; otherwise, treat it as two words.


In this case, the current list of bound nouns is:校[school].


One word:我校[my school]/NN.


Two words:我[my]/PN 单位[organization]/NN.

#### 2.1.5    JJ+N


The pattern is: X+N, where X modifies the N, and X is either a JJ or a prefix.


Note: JJ+N can be a phrase. For example, in one of the files we annotated,全国性[nationwide]/JJ 网络[network]/NN is extended into “全国性[nationwide]/JJ 观测[observe]/VV 苏梅克-列维/NR 9 号[number 9]/NN 彗星[comet]/NN 撞击[hit]/VV 木星[Jupiter]/NN 的/DEC 网络[network]/NN”.


Segment X+N according to the type of X:


- X is a prefix: treat X+N as one word.[1](#bookmark93) A list of prefixes:阿，非[non-].


Ex:阿爸[father]/NN,非商业化[non-commercial]/JJ 宗旨[purpose]/NN.


A list of JJs:原[former],前[former]


Ex:原[former]/JJ 在[at]/P 华[China]/NR 老挝[Laos]/NR 难民[refugee]/NN;


前[former]/JJ 民主德国[German Democratic Republic]/NR.


- X is a non-predicate adjective:[2](#bookmark94) if both JJ and N are monosyllabic, tag it as one word; otherwise, treat it as JJ+N.


One word:女人[woman]/NN.


Two words:共同[mutual]/JJ 利益[interest]/NN.


- X is an adjective: treat it as one word if X or N is bound or the meaning of X+N is non-compositional. For unclear cases, if both JJ and N are monosyllabic, treat JJ+N as one word (e.g., 鲜花[fresh flower]/NN,强队[strong team]/NN,红茶[black tea]/NN,好评[favorable comment]/NN).


One word:小媳妇[daughter-in-law]/NN,大洲[continent]/NN,大海[sea]/NN.


Two words:厚[thick]/JJ 书[book]/NN.

#### 2.1.6    LC+N


If both LC and N are monosyllabic, treat the string as one word, and tag it as NN or NT according to its meaning.


Ex:前院[front yard]/NN,前天[day before yesterday]/NT,左肩[left shoulder]/NN.

#### 2.1.7    N+LC


Treat N+LC as one word if:[3](#bookmark95)


- the N and LC are monosyllabic; and


- in this context, the N is non-referential or bound; and


- in this context, the N can not be modified by Det-M or other modifiers.


Otherwise, treat it as two words.


- One word (some of them might be two words in other context):室内[indoor](室内[indoor]/NN 训练[training]/NN),台下[off stage],眼前[at present],境外[foreign](境外[foreign]/NN 集团[group]/NN),境内外[domestic and international]/NN,海外[oversea](海外[oversea]/NN 市场[market]/NN)，背后[at the back]/NN,天下[world]/NN,国内[domestic]/NN,午后[afternoon]/NT,赛前[before the contest]/NT.
- Two words:中午[noon]/NT 以后[afterwards]/LC.

#### 2.1.8    N+N: N1 modifies N2


If it is 1+1 or 2+1 (i.e., N1 has one or two hanzi and N2 has one hanzi), treat N1+N2 as one word (i.e., we treat all monosyllabic nouns as potential “接尾词”). If a noun with no more than 2 hanzi is followed by multiple “接尾词”, each monosyllabic noun attaches to the preceding string, and the whole string is treated as one word (e.g., 物理学家[physicist]/NN).


For other cases, the string is treated as two words.


- One word:北京市[Beijing]/NR,研究室[research lab]/NN,发展史[developmental history]/NN,始祖鸟[proto-bird]/NN, 残疾人[the physically challenged]/NN, 清晰度[visibility]/NN, [sense of urgency]/NN, 大奖赛[tournament]/NN,太阳系[the solar system]/NN.
- Two words:北京[Beijing]/NR 大学[University]/NN,玩具[toy]/NN 工厂[factory]/NN,合作[collaboration]/NN 领域[area]/NN,史学[history]/NN 研究[research]/NN.

#### 2.1.9    PN+LC


If both PN and LC are monosyllabic, treat PN+LC as one word and tag it as NT or NN.


One word:此间[here]/NN,此前[before this]/NN,其中[among them]/NN,何时[when]/NT.


Two words:这[this]/PN 以后[after]/LC.

#### 2.1.10    V+N


In this pattern, we assume V is VV (For VA+N, please refer to the section for JJ+N) If V modifies N, treat V+N as one word and tag it as a noun.


one word:烤肉[barbecue]/NN，炒菜[stir-fried dishes]/NN,证明信[certificate]/NN,讨论会[symposium]/NN.[4](#bookmark96)

### 2.2 Proper Noun: NR


Currently, if the proper noun is composed of multiple words, we don't group them.

#### 2.2.1    Personal name


Treat it as one word. Don't give the internal structure unless there is a space between two names (in foreign alphabet).


Ex:张胜利/NR,卡尔[Karl].马克斯[Marx]/NR, John/NR Smith/NR.

#### 2.2.2    Personal name with affixes


Treat it as one word.


Ex:老张/NR,张老/NR

#### 2.2.3    Personal name + title


Treat it as two words.


Ex:张/NR 教授[professor]/NN,张/NR 李/NR 两[two]/CD 位/M 教授[professor]/NN.

#### 2.2.4    Name of Organization/Country/School/..


If the pattern is N1+N2, where N2 is a common noun, then if N2 is monosyllabic, treat N1+N2 as one word, else treat N1+N2 as two words.


Simple names:北京市[Beijing]/NR,黄河[the Yellow River]/NR,沙市[Sha City]/NR,黑龙江省[Heilongjiang Province]/NR.


Complex names:北京[Beijing]/NR 大学[University]/NN,北京[Beijing]/NR 第一[First]/OD 服装厂[Clothing Factory]/NN，美国[the United States]/NR 国会[Congress]/NN.

#### 2.2.5 NR+NR: coordination without conjunction


Treat it as two words.


Ex:中[China]/NR 美[the United States]/NR,中[China]/NR 美[the United States]/NR 关系[relation]/NN, 东[Eastern Asia]/NR 新[Singapore]/NR 澳[Macao]/NR.

### 2.3 Temporal noun: NT


The names of years/months/day/hour and so on are words.


Ex: 1998年[1998]/NT 3月[March]/NT 21日[21st]/NT, 5点钟[5 o’clock]/NT，初一[the first day of a lunar month]/NT，去年[last year]/NT.

#### 2.3.1 CD+N


If CD+N is the name of a time, treat it as one word (NT). If it is the count of the time, treat it as two words (CD+M).


One word: 1998年[1998]/NT, 5点钟[5 o’clock]/NT, 90年代[the 90s]/NT.


Two words: 3/CD 年[year]/M, 3/CD 个/M 月[month]/NN.

### 2.4 Localizer: LC


Localizers are separated from the noun that it attaches to except for the case mentioned in Section 2.1.7 (i.e., N+LC).


A localizer is either one or two syllables:


- monosyllabic localizers: e.g.内[in],后[after].


- bisyllabic localizers: e.g.之间[between],以来[since],以后[afterwards],左右[around].

### 2.5 Pronoun: PN


Treat it as one word.


Ex:他们[they]/PN,他自己[himself]/PN，自己[self]/PN.

### 2.6    Determiner: DT


We separate DTs from the succeeding words.


Ex:这[this]/DT 三[three]/CD 个/M 人[people]/NN,各[each]/DT 国[nation]/NN.


Currently, we treat 这些[these] as one word, and tag it as DT.


Some examples of bisyllabic DTs:全体[all]，其余[the rest], —切[all],这些[these],那些[those],所

### 2.7    Cardinal number: CD


Treat it as one word. Note: the internal structure of a CD is very easy to recover if needed.


Some examples:


- Pure numbers: 一亿三千万[one hundred and thirty million]/CD, 30.1/CD, 123,456/CD, 35.6%/CD, 30万[three hundred thousand]/CD, 30几[thirty odd]/CD.


- Estimation:三四十[between thirty and forty-nine]/CD 岁[years old]/M.


- CD + X + CD(5.5.4): X is a morpheme such as 余[odd],分之[fraction]，点[point]:三十几亿[three billion odd]/CD,三分之一[one third]/CD,三点一[three point one]/CD,好几[multiple]/CD 个/M.


- CD+X: X is a morpheme such as 余[odd],来[over/odd]:四千一百余[four thousand and one hundred odd]/CD 人[people]/NN,三十来[about thirty]/CD 个/M.

### 2.8 Ordinal number: OD


Treat it as one word.


Ex:第一[first]/OD,第三十一[thirty-first]/OD.

### 2.9 Measure word: M


Treat the measure word, including a reduplicated or a compound measure word, as one word. Treat the string such as 分钟[minute] as one word.


Ex:杯[cup]/M,杯杯[cup-cup]/M,架次[number of flights]/M,分钟[minute]/M.

### 2.10 Verb: VA, VC, VE, and VV

#### 2.10.1 Reduplication: AA, ABAB, AABB, AAB, ABB, ABAC


Treat it as one word.


- AA, A is a verb: AA/V 
  Ex:看看[see]/VV，红红[vivid red]/VA.


- ABAB: AB is a verb: ABAB/V
  Ex:研究研究[research]/VV,雪白雪白[snow white]/VA.


- AABB, AB is a verb: AABB/V
  Ex:来来往往[come and go]/VV,高高兴兴[happy]/VA.
  Note: most of the time, AA or BB is not a word.


- AAB(except for AA-看 in 2.10.2):AAB/V 
  Ex:蒙蒙亮
  Note: most of the time, AA or B is not a word.


- ABB: ABB/V
  Ex:绿油油[bright green]/VA，红彤彤[bright red]/VA.
  Note: most of the time, A or BB is not a word.


- ABAC, etc.: ABAC/V
  Ex:马里马虎[careless]/VA,有条有理[orderly]/VA，一清二楚[very clear]/VA.

#### 2.10.2 “Reduplication”： AA-kan, A-one-A, A-le-one-A，A-le-A


Treat it as one word with internal structure.


- AA-看：（AA/V 看/V)/V
  Ex:(说说[say]/VV 看/VV)/V.
  The basic meaning of the word 看 is to “see”，but in this context，it roughly means "try to do something".


- A-one-A: (A/V one/CD A/V)/V 
  Ex:(想[think]/VV 一[one]/CD 想[think]/VV)/V.

- A-le-A: (A/V le/AS A/V)/V 
  Ex:(想[think]/VV 了/AS 想[think]/VV)/V.


- A-le-one-A: (A/V le/AS one/CD A/V)/V
  Ex:(想[think]/VV 了/AS 一[one]/CD 想[think]/VV)/V.


Note: V+CD+M is treated as three words, e.g. 看[look]/V 一[one]/CD 眼[eye]/M (take a look).

#### 2.10.3 A-not-A

Treat it as one word with internal structure.

Ex:(来[come]/VV 没[not]/AD 来[come]/VV)/V，（高[happy]/VA 不[not]/AD 高兴[happy]/VA)/V， (喜[like]/VV 不[not]/AD 喜欢[like]/VV)/V.

#### 2.10.4 AD+V


If one or more of the following hold, treat AD+V as one word (V):


- no free word can intervene between AD and V,


- the V cannot be a predicate without the AD,


- the subcategorization frame of AD+V is different from that of the V.


Otherwise, treat it as two words.

- One word:胡说[talk nonsense],胡来[mess things up],敬献[present with great respect],尚余[remain]
  (尚余[still remain]/VV 七十五[75]/CD 名/M 难民[refugee]/NN)，历任[have served successively as],并列[tie],不畏[not afraid of].

- Two words:已经[already]/AD 采取[take]/VV,不[not]/AD 应该[should]/VV，没[not]/AD 完成[complete]/VV.

#### 2.10.5 MSP+V


If the V can not be a predicate without the MSP, treat MSP+V as one word (V).


One word:以期[in order to]/W (以期[in order to]/W 在[at] 与[with] 美国[the United States]、 瑞典[Sweden]、挪威[Norway]、这些 [these]、世界[world]、强队[strong teams] 、交锋[competition] 、中[during]...).

#### 2.10.6 N+V


Some subject-predicate strings can be either a phrase or a word depending on the context.


If a VP-modifier can be inserted between the subject and the predicate part and the “subject” is referential, then the string is a phrase, otherwise it is a word.


One word:头疼[headache]/VA in “他[he]/PN 让[make]/VV 我[me]/PN 很[very]/AD 头疼[headache]/VA〈He gives me a headache〉”.


Two words:头[head]/NN 疼[ache]/VA in “我[I]/PN 头[head]/NN {很[very]/AD}疼[ache]/VA〈I have a headache〉”.

#### 2.10.7 V+N


If the V and the N are separated (by the aspect markers, by the modifiers of the N, or because the V is reduplicated), treat V+N as two words.


If the V and the N are adjacent,[6](#bookmark98)


- If V-N is semantically transitive and its object can occur after N only when VN are adjacent (therefore the V is not a ditransitive verb)，treat V+N as one word (e.g.,投资[invest]/VV, 出席[be present]/VV,关心[care]/VV,为期[scheduled for a specific duration of time]/VV).


- If V and VN have similar meaning and both are semantically intransitive, treat VN as one word (e.g.,睡觉[sleep]/VV).


- If N is “bound”, treat VN as one word (e.g.,游泳[swim]/VV,无望[hopeless]/VV,无效[invalid]/VV, 无法[unable to]/VV,辞职[resign]/VV).


- If V-N is 1+1 AND the meaning is non-compositional，treat V-N as one word (e.g.,念书[study]/VV, 流血[bleed]/VV).  


Examples of V-N as two words:访[visit]/VV 华[China]/NR in the sentence 他[he]/PN 曾[previously]/AD 七[seven]/CD 次[time]/M 访[visit]/VV 华[China]/NR〈He has visited China seven times〉.

#### 2.10.8    V+R


The tests for verb resultative compounds (V-Rs): both V and R are verbs and the potential forms (V-de-R, V-not-R) exist. So our definition of V-R includes resultative and directional verb compounds (e.g.，看见[see] and 走上来[walk up])，but it does NOT include words such as 改善[improve] and 鼓动[agitate].

- We treat it as one word. For the sake of compatibility with other guidelines, we give the internal structure for the words if they have more than 2 syllables or if the R is the following:完[finish]/VV.

- Words without internal structure:吃掉[eat up]/VV,看见[see]/VV，擦净[wipe clean]/VV.

- Words with internal structures:(做[do]/VV 完[finish]/VV)/V,(擦[wipe]/VV 干净[clean]/VV)/V, (认识[realize]/VV 到[reach]/VV)/V.

#### 2.10.9    Potential form: V-de/bu-R


We treat it as one word.

- If V-R exists, give the internal structure of V-de/bu-R, otherwise, don't give one.
  Ex: words with internal structure:(擦[wipe]/VV 不[not]/AD 净[clean]/VA)/V，（擦[wipe]/VV 得/DER 净[clean]/VA)/V.

- words without internal structure:吃不了[unable to eat anymore]/VV，买不起[cannot afford]/VV.


Note: the string “V de R” can be ambiguous between potential form and V-de construction. For example, “这[this]张[M]桌子[table]擦[wipe]得[DER]干净[clean]吗[SP]?” can either be a potential form (which means Can this table be wiped clean?), or it could be a V-de construction (which means Has the table been wiped clean?). The two constructions have different syntactic structures. Normally, we can tell them apart by meaning, by the position of the object or by checking whether adverbs can be inserted between the de and the R.

#### 2.10.10    V+DIR


See Section 2.10.8 (i.e., the section for V+R).


Words with internal structure:(走[walk]/VV 出去[out]/VV)/V,(走[walk]/VV 不[not]/AD 出去[out]/VV)/V.


Words without internal structure:走出[walk out of]/VV,想出[think of]/VV.

#### 2.10.11    V+AS


Treat it as two words.[7](#bookmark99)


Ex:走[walk]/VV 了/AS.

#### 2.10.12 V+DER


The pattern is V-de in V-de construction. We treat V-de as two words.[8](#bookmark100) Ex:走[walk]/VV 得/DER (走[walk]/VV 得/DER 很[very]/AD 快[fast]/VA).

#### 2.10.13 Verb coordination without conjunctive words


If the pattern is 1+1, treat it as a word; otherwise, treat it as multiple words.


One word:修建[build]/VV.


Two words:宣传[propagate]/VV 鼓动[agitate]/VV.

#### 2.10.14 V+coverb


The pattern is V+X, where X is monosyllabic and it is either a P or a V.[9](#bookmark101)

- We first decide whether V+X is a word. If it is, we use its syllable count to decide whether to show its internal structure. That is, if V is monosyllabic, don't give the internal structure;


otherwise, give the internal structure.


- treat V+X as one word if X is in the following list:给[give];为[become],成[become]，作[treat as],到[arrive],出[out];自[from],向[toward],入[in],以[with].
  Ex:

  - 给[give]:送给[give/send to]/VV，交给[hand in]/VV，（赠送[give as a gift to]/VV 给[give]/VV)/V.
  - 为[to],成[become/into]，作[do/as],到[arrive],出[out]:(翻译[translate]/VV 成[become]/VV)/V,当作[treat as]/VV,起到[take effect]/VV，找到[find]/VV,(认识[realize]/VV 到[reach]/VV)/V,决出[decide victors]/VV.
  - 自[from],向[toward],入[in],以[with]:来自[come from]面向[face toward]/ into]/VV,迈向[step toward]，VV,报以[respond with]/VV，加以[supplement with]/VV.


- treat V+X as two words if X is in the following list:在[at]，似[like].

  - Ex:生[to be born]/VV 在[at]/P，坐[sit]/VV 在[at]/P，留[stay]/VV 在[at]/P，深[deep]/VA 似[like]/P 海[sea]/NN.

-  treat V+X as one word or two words (V+P) according to the meaning of the X, if X is in the following list:于[at].

  - If 于 in V + 于 can be replaced by 在[at], tag V+于 as two words (V+P). Otherwise, tag it as one word.
  - One word:等于[equal to]/VV,缘于[due to]/VV,大于[bigger than]/VV,小于[smaller than]/VV, 无助于[of no help to]/VV,低于[lower than]/VV,利于[be beneficial for]/VV,有利于[be beneficial for]/VV.
  - Two words:生[to be born]/VV 于[at]/P,建[build]/VV 于[at]/P.


#### 2.10.15 Others

Generally, in X+V(or V+X) where X modifies V, if X cannot modify other verbs, or V cannot be a predicate without the X, treat X+V as one word.

- Ex:以期[in order to]/VV

### 2.11 Adverb: AD


Adverbs are separated from the XP that it modifies.


Adverbs that modify numbers:近[almost]/AD 三十[thirty]/CD，5[five]/CD 分[minute]/M 多[odd]/AD 钟[minute]/NN.[10](#bookmark102)


The string such as 极大[extremely big] is an adverb when it modifies VPs, not AD+VA, because the VA(大[big]) cannot modify VPs without the AD(极[extremely]).

#### 2.11.1    Reduplication


When VA(or AD) reduplicates, the resulting word can be an AD.


Ex:好好[well]/AD 干[do]/VV，常常[always]/AD,仅仅[only]/AD.

#### 2.11.2    DT+M/N


The following are tagged as ADs when they modify VP/S:这样[this way]/AD (这样[this way]/AD 做[do]/VV),同机[on the same airplane]/AD (同机[on the same airplane]/AD 到达[arrive]/VV).

#### 2.11.3    P+PN


We treat the following as two words:为[for]/P 此[this]/PN.

#### 2.11.4   P+N


The following can be seen as frozen PPs. Since they have the same function as the ADs, we treat them as words, and tag them as ADs:迄今[until now],沿途[on the way],即席[impromptu]， 为何[why](为何[why]/AD 愈演愈烈[get worse and worse]/VA),为什么[why](为什么[why]/AD 来[come]/VV).

#### 2.11.5 PN+LC


If a PN+LC totally loses the function of an NP and the string acts like an adverb, treat it as an adverb.


We treat the following as ADs:此外[in addition]/AD.

#### 2.11.6 Others


If in that context a string totally loses the function of the XP(where X is the head of the string) and the string behaves like an adverb, tag it as AD.


We treat the following as ADs:进一步[a step further]/AD.

### 2.12 Preposition: P


Separate it from NP/S that follows it.


Most prepositions are monosyllabic. Some common bisyllabic prepositions are:为了[in order to],随着[along with],沿着[along],本着[in conformity with],鉴于[due to],除了[except],经过[through],作为[being/regard as],截止[until].


When a coverb follows a verb, we have to decide whether the word is part of a verb compound. A list of such coverbs are:于，给，为. See Section 2.10.14 for details.

### 2.13 Subordinating Conjunction: CS


Separate it from the XP that follows it.


Strings such as 只有[only] are ambiguous:


- CS:只有[only if]/CS ...才[then]/AD ....


- AD+VE:他[he]只[only]/AD 有[have]/VE 三[three]/CD 块/M 钱[money]/NN〈He only has three dollars〉.

### 2.14 Conjunction: CC


Separate it from the XPs that it conjoins.


Ex:和[and]/CC,与[and]/CC.

### 2.15 Particle: DEC, DEG, DEV, DER，AS, SP，ETC，and MSP


Separate it from the XP that it attaches to.[11](#bookmark103)


Most particles are monosyllabic. One of bisyllabic particles is 的话[if so]/SP.

### 2.16 Interjection: IJ


Treat it as one word.


Ex: 哈[expressing satisfaction and so on]/IJ.

### 2.17 Onomatopoeia: ON


Treat it as one word.


Ex:哈哈[sound of laughter]/ON,哔啦啦[sound of water/rain]/ON

### 2.18 Other noun-modifier: JJ


Separate it from the measure word (M) or the noun (N) that it modifies. Ex:三[three]/CD 大[big]/JJ 杯[glass]/M 水[water]/NN


When JJs modify nouns, the JJs can be adjectives,区别词(非谓形容词)，or “phrasal words”. Most of the “phrasal words” have two parts: X+Y, both X and Y are monosyllabic, and X or Y is the short-form of the corresponding words. Some examples of the “phrasal words” are as follows:

#### 2.18.1 V+N


V+N:随军[being with the army]/JJ 妓女[prostitute]/NN,旅英[having studied in England]/JJ 学者[scholar]/NN,成套[forming a complete set]/JJ 设备[equipment]/NN,发稿[sending manuscripts to press]/JJ 时间[time]/NN，获奖[receiving award]/JJ 学者[scholar]/NN, 驻华[being stationed in China]/JJ 使馆[embassy]/NN, 给惠[giving benefit]/JJ 国家[nation]/NN.

#### 2.18.2 AD+VA


AD+VA:最新[the newest]/JJ 消息[news]/NN,超大[extra-large]/JJ 规模[scale]/NN 集成[integrate]/NN 电路[circuit]/NN，较大[relatively big]/JJ 增长[growth]/NN.


The common “AD”：最[the most],超[extra-],较[relatively].

#### 2.18.3 VA+N


VA+N/M:高层[high-ranking]/JJ 人士[official]/NN,高速[high speed]/JJ 公路[highway]/NN，大幅[big size]/JJ 标语[slogan]/NN.

#### 2.18.4 CD+N


CD+N/M:两国[two-nation]/JJ 关系[relation]/NN，多国[multi-nation]/JJ 部队[troop]/NN

#### 2.18.5 P+N


P+N/LC:对外[foreign]/JJ 政策[policy]/NN

#### 2.18.6 Others


others:关贸[tariff and trade]/JJ 总协定[treaty]/NN,年均[annual average]/JJ 增长率[growth rate]/NN, 上述[aforementioned]/JJ 三[three]/CD 国[nation]/NN,历届[all previous sessions]/JJ 世界[world]/NN 体操[gymnastics]/NN 大赛[championship]/NN,有关[related]/JJ 方面[parties]/NN.

### 2.19 Punctuation: PU


Treat it as one word, except when it is part of another word; for example, “,” in a number (e.g., 123,456/CD) or “.” in proper names (e.g., 卡尔[Karl].马克斯[Marx]/NR).

### 2.20 Foreign word: FW


Treat it as one word, except when it is part of another word (e.g., 卡拉OK[Karaoke]/NN).

### 2.21 Others

#### 2.21.1 Idioms


The frozen idioms (成语）are treated as words when they function as an NP or a VP.


Ex:各有所好[each has his likes and dislikes]/V, 一比高低[compete]/V.

#### 2.21.2 Telescopic strings


Telescopic strings are treated as one word if they are not too long (less than four characters). If it is too long, segment them according to pauses.


Short strings:进出口[imports and exports]/NN 贸易[trade]/NN,国内外[foreign and domestic]/NN 形势[situation]/NN.


Long strings:交响[symphony]/JJ 乐团[orchestra]/NN，北京[Beijing]/NR 市长[mayor]/NN.

#### 2.21.3 Short form


Ex:三好[three-merit]/JJ 学生[student]/NN，教科文[education, science, and culture]/NN 组织[organization] (UNESCO),七中[the seventh central government]/NN 全会[convention]/NN.

  
Shortened part is treated as one word. If the shortened part is longer than 3 syllables, segment them according to phonologic evidence (e.g., pauses). The structure of the short form might be different from that of the full form.


Chapter 3 Collocation with Some Morphemes
---------

### 3.1 Strings with zhe5


Some prepositions end with 着.


Ex:随着[along with]/P.

### 3.2 Strings with zhi1


zhi+LC, where LC is monosyllabic, is treated as one word (LC).

- Ex:之外[aside from]/LC,之中[among]/LC.
- zhi1+CD is treated as DEG+CD (e.g.,方法[method]/NN 之/DEG 一[one]/CD,方法[method]/NN 之/DEG 三[three]/CD).


For simplicity,之一 in a sentence such as 中国是发展中国家之一 is treated as one word and tagged as an NN.


zhi1+N is treated as DEG+N (e.g.,少年[Children]/NN 之/DEG 家[Club/Center]/NN).

### 3.3 Strings with bu4


If X in X+不[not] (or 不[not]+X) must co-occur with bu4 or the meaning of X+不[not] is not compositional, we treat X+bu4 as one word.


Words that include bu4(不[not]):不到[less than] (不到[less than] 5 分钟[minutes]), 不足[less than] (不足[less than] 5 公斤[kilogram])，不便[inconvenient]，不久[not before long].

### 3.4 Strings with shi4


For simplicity, we treat 特别是[particularly]/AD as one word.

### 3.5 Strings with xie1


The following are treated as one word: 这些[these]/PN(or DT), 一些[some]/CD.

### 3.6 Strings with you3

V+有[have] is often a verb; for example,刻有[engraved with]/VV,具有[possess]/VV,富有[rich]/VV.

mei2you3(没有) is always treated as one word(VV or VE or SP).


Many idioms include the word 有[have]; for example,若有所思[as if lost in thought]/VV.


The following are two words:有[have]/V 所/MSP,仅[only]/AD 有[have]/V,有[have]/V 可能[possibility]/NN.


The following are ambiguous without the context:


- you3-dian3(有点)：V[have]+M or AD[a little bit]
  It is V+M when 点 can be dropped or replaced by 一点[a little bit].
  you3-dian3 is an AD when it can be replaced by other degree adverbs such as 很[very] or when it is followed by a VP.

  - 他[he]/PN 有点[a little bit]/AD 下不了[unable to get off]/VV 台[stage]/NN〈He felt embarrassed〉.
  - 这[this]/DT 本/M 书[book]/NN 有[have]/V 点/M 意思[meaning]/NN〈This book is interesting〉.
  - 这[this]/DT 本/M 书[book]/NN 有[have]/V 点/M 看头[worth reading]/NN〈This book is worth reading〉.


- you3-de5(有的)：V[have]+DEC or DT[some]

  - 他[he]有[have]/V 的/DEC 书[book]我[I]也[also]有[have]〈The books that he has, I have, too〉.
  - 有的[some]/DT 人[people]已经[already]走[leave] 了[AS]〈Some people have already left〉.


- you3-xie1 (有些)：V[have]+M or DT[some]:

  - 我[I]只[only]有[have]/VV 些[some]/M 旧书[old books]〈I only have some old books〉.
  - 他[he]不[not]像[like]有些[certain]/DT 人[people]专门[especially]爱[like]抬杠[argue]〈He is not like certain people who especially like to argue〉.


- zhi3-you3(只有)：AD[only]+V[have] or CS[only if]:

  - 你[you]只有[only]/CS 学习[learn]才[then]/AD 能[able to]改进[improve]工作[work]〈You can only improve your work by learning〉.
  - 他[he]只[only]/AD 有[have]/VV 10 块[M] 钱[dollars]〈He only has ten dollars〉.


### 3.7    Strings with zai4


One word:正在[in the process of]/AD.

### 3.8    Strings with zi4ji3


Always treat PN+zi4ji3 (自己[self]) as one word. Ex:他自己/PN.

Chapter 4 Common Collocations
---------

### 4.1 As one word


- AD:迄今为止[until today]，迄今[until now]，进一步[one step further],越来越[more and more],同机[on the same airplane]，沿途[on the way],即席[impromptu].


- DT:这些[these].


- JJ:对外[foreign] (e.g.,对外[foreign]/JJ 政策[policy]/NN),各界[all circles]/JJ.


- LC:之间[between]，在内[inside].


- NN:其中[among them], 一行[group traveling together].


- P:为了[in order to].


- V:来自[come from],面向[face toward],流入[flow in],迈向[step toward],报以[respond with],为期[scheduled for a specific duration of time]，有利于[be beneficial for].

### 4.2 As two words


- AD-like:并[yet]/AD 未[not]/AD.


- CC-like:及[and]/CC 其[his/its/her]/PN，而[and]/CC 又[in addition]/AD.


- DT-like:各[each]/DT 个/M.


- NN-like:超大[extra-large]/JJ 规模[scale]/NN，我[our]/PN 国[nation]/NN.


- NT-like:零点[midnight]/NT 零一分[one]/NT〈one minute past midnight〉.

### 4.3 Other cases


V-V:(迎上[step forward]/VV 前去[go forward]/VV)/V.

Appendix A Comparison with Other Guidelines
----------


In this appendix, we compare our guidelines with the guidelines from PRC [LTS93] and from Rocling [Chi96]. The grouping of words in our system is done in bracketing stage.


|  | Ours | PRC | Rocling | Example |
| --- | --- | --- | --- | --- |
| Verb |  |  |  |  |
| AA | AA | AA | AA | 看看 |
| ABAB | ABAB | ABAB | ABAB | 研究研究 |
| AABB | AABB | AABB | AABB | 高高兴兴 |
| ABB | ABB | ABB | ABB | 绿油油 |
| AAB(excl AA-看） | AAB | AAB | AAB | 蒙蒙亮 |
| ABAC etc. | ABAC | ABAC | ABAC | 有条有理 |
| AA-看 | (AA/V kan/V)/V | AA kan | AA kan |  |
| A-yi-A | (A/V yi/CD A/V)/V | AyiA | AyiA | 走一走 |
| A-le-A | (A/V le/AS A/V)/V | A le A | A le A | 走了走 |
| A-le-yi-A | (A/V le/AS yi/CD A/V)/V | A le yi A | A le yi A | 走了一走 |
| nonreduced A-not-A | (A/V not/AD A/V)/V | A not A | A not A | 喜欢不喜欢 |
| reduced A-not-A | (A/V not/AD A/V)/V | A-not-A | A-not-A | 喜不喜欢 |
| V-R(R is monosyl.) | v-r except v/V 完/V | v-r | v-r | 打破 |
| V-R(R is bisyl.) | (v/V r/V)/V | v r | v r | 扫干净 |
| V-de/bu-R | (v/V de/DER r/v)/V | v de r | v de r | 打得破 |
| (V-R exists) | (v/V bu4/AD r/v)/V | v bu r | v bu r | 打不破 |
| V-de/bu-R | v-de-r/V | ?? | v-de-r | 来得及 |
| (V-R doesn’t exist) | v-bu-r/V | ?? | v-bu-r | 来不及 |
| V-DIR | (v/V dir/V)/V | v dir | v-dir | 走上来 |
| V-x-O | v/V x/X o/N | v x n | v x n | 吃了饭 |
| VO | depends | depends | depends | 关心，吃饭 |
| V-de | v/V de/DER | v de5 | v de5 | 走得 |
| V-AS | v/V as/AS | v as | v as | 走了 |

**Table A. 1: Comparison with PRC’s and Rocling’s Guidelines**


|  | Ours | PRC | Rocling | Example |
| --- | --- | --- | --- | --- |
| Nouns |  |  |  |  |
| Proper Names (NR) |  |  |  |  |
| LstNm+Fst Nm | one seg | two segs | one seg | 王鸣 |
| LstNm+title | name/NR title/NN | name title | name title | 王市长 |
| NR +接尾词 | nr-nn/NR | depends | nr-nn | 北京市 |
| NR + common noun | nr/NR nn/NN | nr nn | nr nn | 北京大学 |
| complex names | several segs | depends | several segs | 北京第一服装厂 |
| Common nouns |  |  |  |  |
| N+men5 | one seg | one seg | two segs | 学生们 |
| VA+N | depends | depends | depends | 小媳妇 |
| N+N | depends | depends | depends | 牛肉 |
| Temporal nouns |  |  |  |  |
| name of time | cd-year/NT | cd year | cd-year | 1998年 |
| count of time | cd/CD year/NN | cd year | cd year | 3年 |
| DP-related |  |  |  |  |
| CD | one seg | ?? | one seg | 一万三千 |
| CD+X+CD | one seg | several | one seg | 三分之一 |
| AD + CD | ad/AD + cd/CD | ad cd | ad cd | 约三百 |
| CD + X | cd-X/CD | cdX | cd-X | 三百多 |
| di4-CD | di4-cd/OD | di4 cd | di4-cd | 第一 |
| CD+M | cd/CD m/M | cd m | cd m | 这个 |
| M + M | m-m/M | m-m | m-m | 片片 |
| yi1+M+M | yi1/CD m-m/M | yi1 m-m | yi1-mm | 一片片 |
| yi1-M-yi1-M | yi1/CD m/M yi1/CD m/M | ?? | yi1 m yi1 m | 一个一个 |
| Markers |  |  |  |  |
| V-AS | v/V as/AS | v AS | v AS | 打了 |
| V-de | v/V de/DER | v de5 | v de5 | 走得 |
| SP | one seg | one seg | one seg | 吗 |
| de5(的，地） | one seg | one seg | one seg | 我的，高兴地 |
| zhi1(之)+CD/N | two segs | two segs | two segs | 方法之三 |
| zhi1(之)+LOC | one seg | ?? | one seg |  |
| Others |  |  |  |  |
| 成语(no insertion) | one seg | one seg | one seg | 鼠目寸光 |
| ACROM | one seg | one seg | one seg | 北大 |

**Table A.2: Comparison with PRC’s and Rocling’s Guidelines (Ctd)**

Appendix B Treebank Part-of-Speech Tagset
----------


The following is the Part-of-Speech Tagset used in our Penn Chinese Treebank.


|  |  |  |
| --- | --- | --- |
| AD | adverb | 还 |
| AS | aspect marker | 着 |
| BA | 把 in ba-construction | 把，将 |
| CC | coordinating conjunction | 和 |
| CD | cardinal number | 一百 |
| CS | subordinating conjunction | 虽然 |
| DEC | 的 in a relative-clause | 的 |
| DEG | associative 的 | 的 |
| DER | 得 in V-de const, and V-de-R | 得 |
| DEV | 地 before VP | 地 |
| DT | determiner | 这 |
| ETC | for words等，等等 | 等，等等 |
| FW | foreign words | ISO |
| IJ | interjection | 啊 |
| JJ | other noun-modifier | 男，共同 |
| LB | 被 in long bei-const | 被，给 |
| LC | localizer | 里 |
| M | measure word | 个 |
| MSP | other particle | 所 |
| NN | common noun | 书 |
| NR | proper noun | 美国 |
| NT | temporal noun | 今天 |
| OD | ordinal number | 第一 |
| ON | onomatopoeia | 哔哔 |
| P | preposition excl.被 and 把 | 从 |
| PN | pronoun | 他 |
| PU | punctuation |  |
| SB | 被 in short bei-const | 被，给 |
| SP | sentence-final particle | 吗 |
| VA | predicative adjective | 红 |
| VC | 是 | 是 |
| VE | 有 as the main verb | 有 |
| VV | other verb | 走 |

**Table B.1: Our POS tagset in alphabetical order**

Bibliography
------------


[Chi96] Chinese Knowledge Information Processing Group. Shouwen Jiezi - A study of Chinese Word Boundaries and Segmentation Standard for Information Processing (in Chinese). Technical report, Taipei: Academia Sinica, 1996.


[1D92] John Xiang ling Dai. The Head in Wo Pao De Kuai. Journal of Chinese Linguistics, 1992.


[LTS93] Y. Liu, Q. Tan, and X. Shen. Segmentation Standard for Modern Chinese Information Processing and Automatic Segmentation Methodology, 1993.


[Pac98] Jerome L. Packard, editor. New Approaches to Chinese Word Formation, Mouton de Gruyter, 1998.


[SW87] Anna Maria Di Sciullo and Edwin Williams. On the Definition of Word. The MIT Press, 1987.


[XPX+00] Fei Xia, Martha Palmer, Nianwen Xue, Mary Ellen Okurowski, John Kovarik, Shizhe Huang, Tony Kroch, and Mitch Marcus. Developing Guidelines and Ensuring Consistency for Chinese Text Annotation. In Proc. of the 2nd International Conference on Language Resources and Evaluation (LREC-2000), Athens, Greece, 2000.


[1](#footnote1)


The difference between a JJ and a prefix is that the latter, not the former, is bound. As mentioned before, sometimes, it is difficult to tell whether a morpheme is bound or not, so we keep a list of morphemes that we regard as prefixes. In this case, if the N in X+N can be replaced with an NP, we treat X as a JJ, rather than a prefix.


[2](#footnote2)


 A word is a non-predicate adjective if it can not appear as a predicate after the subject without the help of 是...的.


[3](#footnote3)


N+LC1+LC2, where LC1 and LC2 denote opposite directions, is treated similarly.


[4](#footnote4)


In either of the last two examples, the first morpheme is bisyllabic, and it could be tagged as nouns in some context. Because the second morpheme is mono-syllabic, the expression should be treated as one word regardless of the POS tag of the first morpheme.

[5](#footnote5)


[6](#footnote6)


The V+N combination is among the hardest cases for the word definition. The tests proposed here are not perfect. They tend to treat idiomatic phrases (similar to "kick the bucket" in English) as words. However, those errors can be easily corrected if later a dictionary becomes available.

[7](#footnote7)


It has been argued that aspect markers are affixes (e.g., [1D92]). Right now, we do not group the V and the AS together.


[8](#footnote8)


The function of de in the V-de construction is controversial. It ranges from an affix, a particle, to a verb. We will not get into details here.


[9](#footnote9)


Many of Xs in this pattern are “coverbs” and it is highly debated which tag, V or P, X should have in this pattern and whether V+X forms a word by the process such as reanalysis.


[10](#footnote10)


Note: 50 多分钟 is segmented as 50 多[50\_odd]/CD 分钟/M.


[11](#footnote11)


In the literature(e.g., [1D92]), it has been argued that some of the particles such as 得，了 are  affixes. For the sake of compatibility with other guidelines and also because it is very easy to automatically group these particles with preceding words, we separate the particles from the preceding words.


================================================
FILE: docs/annotations/tok/index.md
================================================
# Tokenization

## Chinese
```{toctree}
ctb
msr
```

================================================
FILE: docs/annotations/tok/msr.md
================================================
# MSR中文文本标注规范 (5.0 版)

[**Tokenization Guidelines of Chinese Text (V5.0)**](http://sighan.cs.uchicago.edu/bakeoff2006/MSRAsamp/msra-spec.pdf)

黄昌宁 李玉梅 朱晓丹

Chang-Ning Huang, Yumei Li, and Xiaodan Zhu

微软亚洲研究院

Microsoft Research Asia

2006 年 3 月 27 日

March 27, 2006

微软《中文文本标注规范（5.0 版）》

## 第一章 概述

### 1.1 版本说明

微软亚洲研究院《命名实体标注规范》3.0版是为30万词《人民日报》语料的命名实体（NE)标注任务制定的。其英文版‘Guideline on Chinese Named Entity Annotation’成稿于2003年2月，用于LSP(Lexical Service Platform)课题。当时在研究院，命名实体识别（Name Entity Recognition)和自动分词（Word Segmentation)是文本处理中互相独立的两个过程，所以未曾深入考虑分词词表（lexicon)对命名实体标注带来的影响。2005年3月至7月在准备第二届国际自动分词评测（SIGHANBakeoff2005)的237万词训练语料的过程中修订了该规范，形成4.0版。《命名实体标注规范》4.0版的一个最大特点是把命名实体识别有机地融入到中文自动分词的整体过程中去。因此，除了命名实体自身的定义以外，还需要系统地阐明词表词和各类实体之间的复杂关系。本规范是在微软亚洲研究院《命名实体标注规范》4.0版的基础上编制的。由于规范实际上涵盖了文本中词语和各类实体的标注规则与实例，所以更名为《中文文本标注规范》(Tokenization Guidelines of Chinese Text) 5.0版。

### 1.2导读

规范的第一章（概述)、第二章（专有名词标注总则）、第六章（数字串标注总则）以及第九章（分词歧义消解细则）是每个标注人员必读的材料。其它章节收集了大量的实体标注规则与实例，用以补充各类实体定义的不足。凭借这些具有上下文信息的词例化实例可以进一步提高文本标注的精度和一致性，所以它们是供标注人员经常查阅的参考资料。诚恳欢迎读者对本规范和带标语料中的错误提出宝贵意见，以便及时更正。批评和意见请寄[黄昌宁](mailto:cnhuang@msrchina.research.microsoft.com)。

### 1.3标注格式

format-1是面向标注人员的格式：

/十月九日/上午/ ->/[dat十月九日]/[tim上午]/

format-2是基于XML的标注格式：


/十月九日/上午/ -> `<w><TIMEX TYPE="DATE">十月九日</TIMEX></w><w><TIMEX TYPE="TIME">上午</TIMEX></w>`


*TIMEX* 是时间表达式，日期 *DATE* 和时间 *TIME* 是它的两个子类。

考虑到本规范主要是为标注人员编写的，以后的例子主要以第一种格式（format-1）表示。想了解更多 XML 格式的读者，请参见 MET-2 Guideline1。

1 MET(Multiple Entity Task)是1997年第七届MUC(Message Understanding Conference)会议多实体识别任务的简称。MET-2是当年美国NIST公布的命名实体标注规范。可查阅：http://www.itl.nist.gov/iaui/894.02/related_projects/muc/proceedings/ne_task.html

### 1.4命名实体标记集

表1-1是本规范定义的全部命名实体标记，包括专有名词（*NAMEX*）、时间表达式（*TIMEX*）、数字表达式（*NUMEX*）、度量表达式（*MEASUREX*）和地址表达式（*ADDREX*）等五大类及其下属的三十个子类。

| 大类 | 子类 | Format-1标注集 | Format-2标注集 |
| --- | --- | --- | --- |
| NAMEX | Person | P | PERSON |
|  | Location | L | LOCATION |
|  | Organization | O | ORGANIZATION |
| TIMEX | Date | dat | DATE |
|  | Duration | dur | DURATION |
|  | Time | tim | TIME |
| NUMEX | Percent | per | PERCENT |
|  | Money | mon | MONEY |
|  | Frequency | fre | FREQUENCY |
|  | Integer | int | INTEGER |
|  | Fraction | fra | FRACTION |
|  | Decimal | dec | DECIMAL |
|  | Ordinal | ord | ORDINAL |
|  | Rate | rat | RATE |
| MEASUREX | Age | age | AGE |
|  | Weight | wei | WEIGHT |
|  | Length | len | LENGTH |
|  | Temperature | tem | TEMPERATURE |
|  | Angle | ang | ANGLE |
|  | Area | are | AREA |
|  | Capacity | cap | CAPACITY |
|  | Speed | spe | SPEED |
|  | Acceleration | acc | ACCELERATION |
|  | Other measures | mea | MEASURE |
| ADDREX | Email | ema | EMAIL |
|  | Phone | pho | PHONE |
|  | Fax | fax | FAX |
|  | Telex | tel | TELEX |
|  | WWW | www | WWW |
|  | Postal code | pos | POSTALCODE |

**表1-1命名实体的标记集**

### 1.5基本原则

#### 1.5.1基本考虑

通用性：尽量遵循国际标准MET-2和ER-99，不同之处在本规范中阐明。

实用性：可用于LSP (Lexical Service Platform), TTS (Text To Speech conversion), IR (Information Retrieval), IE (Information Extraction), QA (Question Answering), IME(Input Method Editor)等应用系统。

#### 1.5.2标注对象

##### 1.5.2.1词表词与未登录词

本规范认为：文本中的任何一个词要么是词表词（LW），要么是未登录词（OOV）。两者都是文本的标注对象。未登录词可以进一步分成命名实体（NE）、词法派生词（MDW）和新词（NW）等三部分。本规范定义的命名实体是未登录词的主体。

（1）命名实体（NE）

命名实体可以进一步分成如下五大类共三十个子类（详见表1-1）：

- 专有名词（*NAMEX*）包括人名（*P*）、地名（*L*）和机构名（*O*）等3种。
- 时间表达式（*TIMEX*）包括日期（*dat*）、时间（*tim*）和时段（*dur*）等3种。
- 数字表达式（*NUMEX*）包括百分数（*per*）、钱款（*mon*）、频度（*fre*）、整数（*int*）、分数（*fra*）、小数（*dec*）、序数（*ord*）和比率（*rat*）等8种。
- 度量表达式（*MEASUREX*）包括年龄（*age*）、温度（*tem*）、角度（*ang*）、长度（*len*）、面积（*are*）、容积（*cap*）、重量（*wei*）、速度（*spe*）、加速度（*acc*）和其它（*mea*）等10种。
- 地址表达式（*ADDREX*）包括电子邮箱（*ema*）、电话（*pho*）、传真（*fax*）、电报挂号（*tel*）、邮政编码（*pos*）和网址（*www*）等6种。

在标注过的文本中，词的边界一律用斜线（slash）表示。除了词表词以外，每个独立的命名实体（即非嵌入到词表词内部的实体，见1.5.2.3）也被视为一个词，其标注符号及形式详见本规范。

（2）词法派生词（MDW）

以词表词AB的重迭形式AABB和AB/AB为例：

/*转轨*/*哪*/*有*/*像*/*人*/*说*/*得*/*那般*/*轻轻松松*/？

/*积累*/*多*/*了*/*，*/*抽出*/*时间*/*，*/*认真*/*整理*/*整理*/，

（3）新词（NW）

一个新词的左右两侧用符号&标示，其内部的切分符保留，如：

/&*桑拿*&/*浴*/

/*天时地利*/&*人*/*和*&/*；*/

/[L*罗*]/*货币*/&*列*/*伊*&/

以下是一些真实的例句，例句中的实体标注符号请参阅表1-1。

[Example-1]

```
/[dat ６月２９日]/、/[dat ３０日]/[tim 晚上]/，/[L 北京市]/下/了/[int 两场]/大雨/，/笔者/
居住/的/宿舍/楼/前/，/宽/[len 六七米]/、/长/[len ３０多米]/的/路/上/积水/达/膝盖/之上/。
<sentence>
<w><TIMEX TYPE="DATE"> ６月２９日 </TIMEX></w><w> 、 </w><w><TIMEX
TYPE="DATE"> ３０日 </TIMEX></w><w><TIMEX TYPE="TIME"> 晚 上
</TIMEX></w><w>，</w><w><NAMEX TYPE="LOCATION">北京市</NAMEX></w><w>
下 </w><w> 了 </w><w><NUMEX TYPE="INTEGER"> 两 场 </NUMEX></w><w> 大 雨
</w><w>，</w><w>笔者</w><w>居住</w><w>的</w><w>宿舍</w><w>楼</w><w>前
</w><w> ， </w><w> 宽 </w><w><MEASUREX TYPE="LENGTH"> 六七米
</MEASUREX></w><w>、</w><w>长</w><w><MEASUREX TYPE="LENGTH">３０多米
</MEASUREX></w><w>的</w><w>路</w><w>上</w><w>，</w><w>积水</w><w>达
</w><w>膝盖</w><w>之上</w><w>。</w>
</sentence> 
```


[Example-2]

```
/[dat ６月中下旬]/，/笔者/到/[L 意大利]/、/ [L 西班牙]/等/国/访问/时/，/一个/很/深/的/感受
/是/[L 意]/、/[L 西]/两国/的/高速公路/非常/发达/，/东西南北/，/纵横/成/网/，/四通八达/。
<sentence>
<w><TIMEX TYPE="DATE">６月中下旬</TIMEX></w><w>，</w><w>笔者</w><w>到
</w><w><NAMEX TYPE="LOCATION">意大利</NAMEX></w><w>、</w><w><NAMEX
TYPE="LOCATION">西班牙</NAMEX></w><w>等</w><w>国</w><w>访问</w><w>时
</w><w> ， </w><w> 一 个 </w><w> 很 </w><w> 深 </w><w> 的 </w><w> 感 受 </w><w> 是
</w><w><NAMEX TYPE="LOCATION"> 意 </NAMEX></w><w> 、 </w><w><NAMEX
TYPE="LOCATION">西</NAMEX></w><w>两国</w><w>的</w><w>高速公路</w><w>非
常</w><w>发达</w><w>，</w><w>东西南北</w><w>，</w><w>纵横</w><w>成</w><w>
网</w><w>，</w><w>四通八达</w><w>。</w>
</sentence> 
```

[Example-3]

```
/[O 县委]/决定/选派/任/了/[dur 八年]/[O 城建局]/长/的/[P 周欣光]/担任/[O 老干部局]/长/。
<sentence>
<w><NAMEX TYPE="ORGANIZATION"> 县 委 </NAMEX></w><w> 决 定 </w><w> 选 派
</w><w> 任 </w><w> 了 </w><w><TIMEX TYPE="DURATION"> 八 年
</TIMEX></w><w><NAMEX TYPE="ORGANIZATION"> 城建局 </NAMEX></w><w> 长
</w><w> 的 </w><w><NAMEX TYPE="PERSON"> 周欣光 </NAMEX></w><w> 担 任
</w><w><NAMEX TYPE="ORGANIZATION">老干部局</NAMEX></w><w>长</w><w>。
</w>
</sentence>
```


[Example-4]

```
/[L喇嘛寺村]/地处/[L承德避暑山庄]/，/[L山庄]/寺庙/林立/，/僧侣/穿梭/，/[L山庄]/[L外八庙]/的/[ord第一个]/庙/就/是/[L喇嘛寺]/。

<sentence>
<w><NAMEX TYPE="LOCATION">喇嘛寺村</NAMEX></w><w>地处</w><w><NAMEX
TYPE="LOCATION"> 承德避暑山庄 </NAMEX></w><w> ， </w><w><NAMEX
TYPE="LOCATION">山庄</NAMEX></w><w>寺庙</w><w>林立</w><w>，</w><w>僧侣
</w><w> 穿 梭 </w><w> ， </w><w><NAMEX TYPE="LOCATION"> 山 庄
</NAMEX></w><w><NAMEX TYPE="LOCATION"> 外八庙 </NAMEX></w><w> 的
</w><w><NUMEX TYPE="ORDINAL">第一个</NUMEX></w><w>庙</w><w>就</w><w>
是</w><w><NAMEX TYPE="LOCATION">喇嘛寺</NAMEX></w><w>。</w>
</sentence> 
```

##### 1.5.2.2*L*, *P*,*O*, *dat*,*tim*,*dur*等实体的边界允许跨越多个词表词

例如：

/[L*瑞典*]/[O*斯德哥尔摩国际和平研究所*]/ /[O*中国工商银行上海市分行*]/

/[tim*下午当地时间*5*时*59*分*]/

##### 1.5.2.3专名的标记（L，P，O）可以插入到一个词表词的内部

例如，词表词*抗日战争*和*事后诸葛亮*中的地名和人名应分别予以标注。

/*抗*[L*日*]*战争*/----正确标注。

/*抗日战争*/ ----未标出*L*，是错误标注。

/*抗*/[L*日*]/*战争*/ ----插入分词标记，是错误标注。

/*事后*[P*诸葛亮*]/

##### 1.5.2.4数字串（除专名以外的其他四类表达式）的标记不得插入到词表词的内部

###### 1.5.2.4.1dat，tim等标记不得插入到一个词表词的内部

词表词*夏令营*、*春耕*、*冬训*、*早出晚归*中的*夏、春、冬、早、晚*等词素都有*dat*和*tim*的意思，但不得标注。例如，

/[dat*冬*]*训*/ ---错误标注。

/[tim*早*]*出*[dat*晚*]*归* /  ---错误标注。

然而词表词被整体标注为*dat*和*tim*的情况是常有的，例如：

/[dat*初冬*]/ ----*初冬*是词表词。

/[dat*夏季*]/ ----*夏季*是词表词。

/告别/*了*/[dat*冬日*]/*的*/*凝重*/*、*/[dat*春天*]/*的*/*轻盈*/*、*/[dat*夏日*]/*的*/*浪漫*/，

- 注：在文本中具有比喻意义的*春、夏、秋、冬*、历史上的*今天、昨天、明天*不作标注。例如：

/[dat*今年*]/*又*/*迎来*/*了*/*一个*/*科学*/*的*/*春天*/ /"/*在*/*陆地*/*资源*/*日渐*/*减少*/*的*/*今天*/*，*/

/*他们*/*的*/*明天*/*将*/*更加*/*辉煌*/*。*/

###### 1.5.2.4.2int,ord等标记不得插入到一个词表词的内部

词表词*五湖四海*、*不管三七二十一*、*三纲五常*中的数词不允许标注*int*（整数）。例如，

/[int*五*]*湖*[int*四*]*海*/ ----错误标注。

/*十年动乱*/*结束*/*不久*/*，*/ ----*十年动乱*是词表词。*十年*不标。

/*不管三七二十一*/ /*三纲五常*/

##### 1.5.2.5数词首、半、双、两等

###### 1.5.2.5.1序数词素首

词表中有许多词含有词素*首*，如*首创、首倡、首选、首发、首航、首飞、首演、首映、首战、首展、首席代表、首席科学家、首席执行官、首富、榜首、魁首、居首*等。但不可把词表词中的词素*首*单独作为*ord*（序数）来标注。

/*首席执行官*/----正确标注。

/*[ord首席]执行官*/----在词表词中插标*ord*是错误的。

以下的词表词属于"首+量词"结构，可以整体作为*ord*标注。例如：

*[ord*首届*]*，*[ord*首次*]*，*[ord*首批*]*，*[ord*首位*]*，*[ord*首例*]*

###### 1.5.2.5.2分数词素半

词表中有许多词含有词素*半*，如*半价、半票、半饱、半身、半世、半辈子、上半时、下半场、半边*等，但不可把上述词表词中的词素*半*标注为*fra*（分数）。

/*下半场*/*比赛*/[O*中国队*]/*未进*/*一*/*球*/

/*上半时*/

/*下*[fra*半*]*场*/----在词表词中插标*fra*是错误的。

以下的词表词可作为不同的数字串（*dur*，*tim*，*fra*，*int*，*age*）标注：

*[dur*半年*]*，*[dur*半天*]*，*[tim|dur*半夜*]*，*[fra*半个*]*，*[int|age*半百*]*

- 注：半个西瓜中的半个，与四半中的半概念不一样，前一个半是指二分之一，后一个半是量词，所以标注也不同！！

/[int*一个*]/*西瓜*/*分为*/[int*四半*]/ /[fra*半个*]/*西瓜*/

###### 1.5.2.5.3整数词素双

当数词双成为词表词的一个词素时，如"双方、双边、双手、双打、双杠、双轨、双层、双目、双亲"等，一律不作为整数(*int*)标注。对于非词表词，只标[*int双*]。例如：

/*窗外*/*又*/*起风*/*了*/*，*/*双层*/*的*/*窗*/*硬是*/*阻挡*/*不住*/*沙尘*/*的*/*侵扰*/*。*/

/*双方*/*认为*/*，*/[L*中*][L*美*]/*两国*/*应该*/*从*/*战略*/*的*/*高度*/*和*/*长远*/*的*/*角度*/

- 注：一般情况下，数词和"方"之间不切分整体标为*int*。但"四方"是词表词所以不标。

[*int三方*]/*会谈*/ /*举行*/*四方*/*会谈*/

以下是相关的例子：

/*用*/*任何*/*一*/*部*/[*int双*]/*音频*/*电话*/*只需*/*拨打*/[pho*２５８０*]/*就*/*可以*/

/*部队*/*进行*/*的*/*海上*/*训练*/*、*/[int*双*]/*机*/*穿云*/*、*/*超低空*/*等*/*高难*/*课目*/*训练*/ /*全村*/[are*７００亩*]/*旱地*/*都*/*种上*/*了*/[int*双*]/*膜*/*棉*/*，*/

###### 1.5.2.5.4整数词素两

当数词"两"成为词表词的一个词素时，如"两国、两会、两地、两者、两头、 两手、两边、两旁、两侧"等，一律不作为数位串（*int*）标注。例如：

/*使*/*两国*/*的*/*友好*/*合作*/*得到*/*巩固*/*和*/*发展*/。

- 注：一般情况下，数词和"国"之间是要切分的，如：[*int五*]/*国*/*元首*/

/*前*/*些*/*年*/*我*/*对*/*参加*/*『*/*两会*/*』*/*总是*/*有点*/*发怵*/*。*/

/*大街*/*两旁*/*店铺*/*林立*/

/*戏台*/*两侧*/*立柱*/*上*/*有*/*这样*/*一*/*副*/*对联*/*：*/

/*中间*/[int*两间*]/*是*/*客厅*/*，*/*两边*/*是*/*卧室*/*和*/*书房*/*。*/

/*对*/*分散*/*居住*/*的*/*"*/*五保*/*"*/*户*/*，*/*镇*/*、*/*村*/[int*两*]/*级*/*拨*/*专款*/

/[int*两*]/*车*/*饮料*/*以及*/*办公*/*桌椅*/*，*/

- 注：临时量词"车、船、床、桌、屋子、院子"等不进入int标注。/*成为*/[O*议会*]/[int*两*]/*院*/*审议*/*的*/*重点*/*和*/*舆论*/*关注*/*的*/*焦点*/

- 注：两院不是词表词，所以应当切分标注如上。

/*及早*/*进行*/*政治*/*谈判*/*推动*/[L*两岸*]/*关系*/*发展*/ /*沿江*/[int*两*]/*岸*/*苗家*/*吊脚楼*/*上*/*的*/*观众*/

- 注：词表词两岸是专指台湾海峡两岸的地名。如果泛指江河两岸，则不作为 地名标注，而且要切分并标数词"两"为*int*。

/*一下*/*进*/*了*/[int*两*]/*球*/

#### 1.5.3基本规定

1)标注时，不得在原来的文本中加入回车换行符。

2)对于NIST制定的两个中文NE标准：MET-2和ER-99。前者已有系统参加评测，它们的评测结果可供后来者参考；后者是前者的修订版,但尚未有系统参加测试。本规范与这两种标准不同之处将尽可能在注释中加以说明。例如：

/[dat*去年上半年*]/

- 注：MET-2把*去年上半年*整体视为*dat*；ER-99则只将*上半年*视为*dat*。

3)对于微软研究院根据自己的需要而加入的标记，本规范也将在注释中加以说明。比如本规范要求的如下标注：

/[P*邓小平*]/*理论*/

- 注：MET-2和ER-99规定，*理论，主义，思想，定律*等词前面的人名均不作为专名标注（见2.8）。

## 第二章 专有名词标注总则 

### 2.1专有名词（NAMEX）标注通则

对于人名、地名和机构名这三类专有名词，MET-2和ER-99之间的差异甚微，在它们给出的示例中，只有两处不同：*中南美*和*长江流域*（具体情况见后）。所以在制订人名、地名、机构名的标注规范时，我们没有刻意去区分这两个标准，而是力图把它们统一地融入本规范。

下面给出人名、地名、机构名的定义。

### 2.2专有名词是具体的、特定的，而不是抽象的、泛指的

比如：*上苍、老外、姑娘，小镇，企业*等就不应视为专有名词。

### 2.3复合专有名词的标注不允许嵌套

在MET-2和ER-99标准中，任何命名实体都不允许嵌套。换句话讲，只标一个实体的最长边界，不标其内部包含的其它实体。

### 2.4人名、地名、机构名中的数字串不单独标出

例如:

/[P*龟山一郎*]/

/[L*德富路二四一至二六三号*]/

/[O*北京*101*中学*]/

/[O*北京*[ord*四*]*中*]/ ----这种嵌套式的标注是错误的！

### 2.5含有外文和数字的命名实体应整体一起标注

例如：

/[O*American航空公司*]/

/[O*SONY公司*]/

### 2.6当两个实体用虚词的连接时应分别标注为两个实体

例如:

/[L*美国*]/*的*/[L*纽约*]/

/[L*美国*]/*的*/[P*理查德本森*]/

但当*的*成为实体的一部分时，要整体一起标注。例如：

/[O*美的电器集团*]/

### 2.7实体前后有引号或书名号的情况

如果一个命名实体中间有引号或书名号，则引号或书名号是该实体的一部分。如果一个实体被外面的引号或书名号括起来，那么其引号或书名号就不作为实体的一部分标注。例如：

/[O"*阿克布拉克*"*中哈合资企业*]/ 

/[O*美国《幸福》杂志*]/

/*《*/[O*星岛日报*]/*》*/*的*/*社论*/*说*/

### 2.8短语内部包含实体、但整体又不是命名实体的情况

ER-99规定：如果一个短语内部包含实体、但整体又不是命名实体，则一律不作标注。本规范则要求对该短语中的实体部分加以标注。例如:

/[L*巴拿马运河*]/*条约*/

- 注：ER-99认为，*巴拿马运河条约*整体不能分解，其中的地名不应标注。但本规范把其中的*巴拿马运河*单独标为地名。

/[L*巴拿马运河*]/ ----*巴拿马运河*单独出现时，作为地名标注。

/[L*香港*]/*脚*/

- 注：英文为"Hong Kong foot"，类似于"athlete's foot"，不可分解，所以ER-99规定整体不标。本规范仍将*香港*标为地名。

/[L*美国*]/*小姐*/

- 注：原文为"Miss America"，指选美活动中获全美第一名的小姐。对此ER-99规定整体不标。本规范仍将*美国*标为地名。

/[L*美国*]/*姑娘*/ ----ER-99对本例的*美国*也是标注的。

/[ord*第四十六届*]/[O*太平洋亚洲旅行协会*]/*年会*/

- 注：此例在ER-99中整体不标，理由是不可分解。本规范认为找不出充分理由说明其不可分解。所以我们把*太平洋亚洲旅行协会*标为机构名。*第四十六届太平洋亚洲旅行协会年会*整体不是机构名。

/[P*毛泽东*]*思想*/ /[P*马克思*]*主义*/

/[P*马克思*]/*主义*/ ----错误标注！因为*马克思主义*是词表词。

/[P*阿佛加罗*]/*定律*/

- 注：ER-99规定，在*理论、主义、思想、定律*等词前面出现人名时，是整体不可分解的字符串；因此该字符串和其中的人名都不标注。但本规范仍将标注其中的人名。

### 2.9与军队相关的情况

当泛指某个国家的军队（如*英军、美军*等）时，不是机构名；当指一个具体的军种（如*空军、陆军、海军*等）时，要标注为机构名。例如：

/[L*美*]/*军*/*飞机*/

/[O*斯里兰卡空军*]/ 

/[O*英国皇家空军*]/

但是，有如下特殊情况：

/[L*济南军区*]/ ----军区是*L*而不是*O*。

/[L*彼得森空军基地*]/ -----军事基地是L而不是O。

/[L*西非*]/&*维*/*和*&/*部队*/ ------部队不作为机构名标注。

### 2.10多媒体、产品和条约中的人名、地名、机构名

ER-99规定：当人名、地名、机构名属于多媒体、产品和条约时，均不加标注。但本规范对上述实体名还是要标注的。例如：

/[P*邓小平*]/*一*/*片*/*的*/*播出*/

- 注：ER-99规定，电视节目的名字*邓小平*不标。本规范仍把它标为人名。此外，*邓小平*作为片名，在规范的文本中应当用书名号括出，如《邓小平》。

/*二战*/ ----*二战*是事件,所以不标注。

/[L*香港*]/*百*/*题*/*今天*/*为*/*您*/*解答*/

- 注：ER-99规定，香港百题是电视片的标题，所以专名香港不予标注。但本规范仍把香港标为地名。下面其它的例子就不一一解释了。

/*这*/*本*/*介绍*/[P*毛泽东*]/*的*/*小说*/ ----*毛泽东*要标注。

/*这*/*本*/*名*/*为*/[P*毛泽东*]/*的*/*小说*/ ----ER-99*毛泽东*不标。

/[L*广州*]/*条约*/ ----ER-99*广州*不标。

/[L*辽*][L*沈*]*战役*/ ----ER-99*辽沈*不标。

本规范在后面还要对人名、地名、机构名中不加标注的情况作专门的说明，详见下面的各章节标注细则。

### 2.11别名或简称的标注

对人名、地名、机构名的别名或简称要标注。例如：

/[O*IBM*]/

/[L*深*]/[L*沪*]/*股市*/ 

/[O*北约*]/

/[L*中*][L*美*]/*首脑*/*互访*/

/[L*中*]/[L*文*]/*双方*/*一致*/*认为*/

- 注：由于中美是词表词，标注地名时不可插入分词标记。中文也是词表词，但这里是指中国和文莱，所以标成地名时需要在两个简称中插入分词符号。这样的词表词还有中意、意中、中巴、日中、中肯、中非等。巴中是一个地名，但表示巴基斯坦和中国时需要用分词符号把两个简称分隔开。

- 注：对于简称中嵌套的人名、地名、机构名不予标注，如：

/[O*中共*]/ ----*中*指*中国*，但不标。

/[O*中共中央政治局*]/ ----同理，不标注*中*。

## 第三章 人名

人名一般包含姓和名两部分，姓是表明家族的字，有单姓和复姓之别；名也就是名字，是一种称号，由一个或几个字组成，跟姓合在一起，用来代表一个人，以区别于别的人。下面将对人名的标注规则进行详解。

### 3.1人名标注规则

正常情况下，人名一般包含姓和名两部分，标注规则如下表所示：

| **序号** | **情况** | **标记方法** |例子 |
| --- | --- | --- | --- |
| 1 | 只含姓，没有名 | 标出姓氏部分 | *[P*庄*]*、*[P*欧阳*]*、*[P*司马*]* |
| 2 | 只包含名字 | 标出名字部分 | *[P*育焜*]* |
| 3 | 姓名 | 姓名整体标出 | *[P*苏宗哲*]*、*[P*萝莉胡吉温*]* |
| 4 | 姓名\|姓\|名+称谓、称谓+姓名 |  |  |
| 5 | 前缀+姓名\|姓\|名姓名 |  |  |
| 6 | 姓名+姓名 | 分开来标 | *[P*李向东*]/[P*李向阳*]* |
| 7 | 外国人名 | 作为一个整体来标 | *[P*罗马里奥*]*[P*马拉多纳*]*[P*比尔*•*盖茨*]* |

- 说明：当人名中包含•时，整体标注为人名，如[P*比尔•盖茨*]。

### 3.2人名标注细则

#### 3.2.1人名的示例和详细说明

##### 3.2.1.1人名实例

/[P*颜惠忠*]/ 

/[P*连战*]/ 

/[P*凡*•*高*]/

/[P*陈方安生*]/

---当妻子与丈夫的名字写在一起时，要作为一个人名整体标注为P！

##### 3.2.1.2称谓、绰号、官职不作为人名的一部分

称谓、绰号、官职(如先生、总理等)不作为人名的一部分。例如，

/[P*张*]/*经理*/ 

/[P*李*]/*市长*/

/[P*陈*]/*姓*/*游客*/*说*/ 

/[P*刘*]/[ord*二*]/*嫂*/ /[P*周*]/*总理*/

/[P*雷锋*]/*同志*/

/[P*奥尔布赖特*]/*国务卿*/

##### 3.2.1.3当称谓和姓名不可分时应整体标注为人名

/[P*李主席登辉*]/*先生*/

/*处*/[P*李犯清龙*]/*死刑*/*，*/

/[P*李犯*×*龙*]/*持*/*刀*/*行凶*/*杀害*/*无辜*/*青年*/*，*/

##### 3.2.1.4几世、几代要作为人名的一部分

/[P*十四世达赖丹增加措*]/

/[L*英国*]/*女王*/[P*伊丽莎白二世*]/

##### 3.2.1.5家族实体

/[P*蒋*]/*氏*/*父子*/

/[P*西迪*]/*兄弟*/

##### 3.2.1.6圣人和宗教人物要标注为人名

/[P*释迦穆尼*]/ 

/[P*达赖*]/*喇嘛*/

### 3.3虚构的人物、动物的名字要标注为人名

#### 3.3.1在童话、小说中虚构人物要标注为人名

/[P*孙悟空*]/ 

/[P*玉皇大帝*]/

#### 3.3.2虚构的动物和非人的人物要标注为人名

/[P*唐老鸭*]/

/[P*花仙子*]/

/"/[P*盼盼*]/"/*是*/*国内外*/*著名*/*的*/*熊猫*/*明星*/*，*/

/*争相*/*目睹*/*狮*/*王*/[P*木法沙*]/*和*/*王后*/[P*色拉碧*]/*产下*/*的*/*小*/*王子*/[P*辛巴*]/

/*走进*/*一家*/*饭馆*/*，*/*发现*/*老板*/*就*/*是*/*大*/*灰*/*狼*/[P*罗克*]/*。*/

#### 3.3.3用称谓或朝代等名号来指称特定人时要标注为人名

例如：

/[P*康熙*]/ 

/[P*乾隆*]/ 

/[P*秦始皇*]/ 

/[P*老子*]/ 

/[P*孔子*]/

### 3.4不标注为人名的各种情况

#### 3.4.1虚构的非人的植物的名字不作为人名标注

如：

/"/*彩霞*/*，*/"/*石子*/*小声*/*嘟哝*/*着*/*，*/"/*多*/*恶心*/*的*/*名字*/*！*/"/

/*电磨*/*姐姐*/*故意*/*气*/*气*/*小*/*毛驴*/*，*/*说*/*：*/"/*输*/*了*/*，*/*可*/*不能*/*哭鼻子*/*。*/"/ 

/"/*卡车*/*哥哥*/*，*/*我*/*和*/*你*/*来*/*比*/*一*/*比*/*谁*/*运*/*得*/*多*/*，*/*怎么样*/*？*/"/

/*好像*/*在*/*说*/*：*/"/*荷花*/*姐姐*/*，*/*你*/*好*/*！*/

#### 3.4.2对于嵌套在地名和机构名中的人名，不作标注

如：

/[L*嘉诚广场*]/

/[O*中山大学*]/

/[O*宋庆龄基金会*]/

#### 3.4.3作为书名或画名的人名

作为书名或画名的人名ER-99不标（见2.8），但本规范是要作标注的。如：

/*世界*/*名画*/*《*/[P*蒙娜莉萨*]/*》*/

/*《*/[P*蒋介石*]/*与*/[P*毛泽东*]/*》*/

#### 3.4.4法律、法庭事件、天气形成、疾病和奖金等五种情况

当人名后面紧跟法律名、法庭事件、天气形成、疾病、奖金这五种情况时，人名不标注。

例如：

/*里*/*氏*/[ord*六点二级*]/ -----*里*不标。

/*专家*/*呼吁*/*人们*/*要*/*注意*/*沙*/*氏*/*杆菌*/ -----*沙*不标。

/[P*诺贝尔*]*奖*/ -----ER-99*诺贝尔*不标。

#### 3.4.5在人名后面出现基金会时要整体标注为机构名

/[O*李嘉诚基金会*]/

所以*基金会*和*奖、奖金*是不同的两种情况，需加以区别。又如

/[O*李嘉诚股份有限公司*]/ 

/[O*诺贝尔股份有限公司*]/

## 第四章 地名

地名包括洲、海洋、国家、省、市、县、地区、街道、乡、镇、村、机场、军事基地、军区、铁路、公路、桥梁、海峡、海湾、港湾、河流、湖、公园、草原、煤矿、牧场、养殖场、音乐厅、剧院、教堂、寺庙、图书馆、博物馆、美术馆、展览中心、公园、动物园、植物园、火车站、广场、大厦、大楼、体育场(馆)、游泳馆(池)、赛车场、商城、超市、书店(城)等城市公共设施，还包括某些特定的城市建筑和虚构的处所。详见下表。

### 4.1地名标注规则

| **序号** | **情况** | 标记方法 |例子 |
| --- | --- | --- | --- |
| 1 | 只是单独地名 | 标出地名部分 | *[L*中国*]*[L*竹塘乡*] |
| 2 | 地名+地理(行政)单位 | 作为整体标出 | *[L*北京市*]*[L*台北县*]*地理单位如：省、地区、市、县、乡、镇、村、店、庙、沟、屯、坟、崖、海洋、河、川、江、峡谷、海湾、港湾、丘陵、湖、半岛、三角洲、区、街、路、街、街道、社区、小区、公寓、音乐厅、剧院、图书馆、博物馆、美术馆、展览馆、公园、动(植)物园、火车站、广场、大厦、大楼、体育场(馆)、游泳馆(池)、赛车场、商城、超市、书店(城)等城市公共设施及象征性建筑物、军事基地、军区等。*[L*天安门广场*]*[L*艾菲尔铁塔*]* |
| 3 | 包含上、下位的地名（即合成地名）以及并列的地名 | 一律分别单独标出 | *[L*山东省*]/[L*青岛市*]/[L*胜利广场*]*[L*青岛市*]/[L*孙中山广场*]*[L*北京市*]/[L*海淀区*]/[L*知春路*]/[L*希格玛大厦*]*[L*北京*]/*、*/[L*天津*]/*、*/[L*上海*] |
| 4 | 地名简称 | 单独标出 | *[L*鲁*]/*、*/[L*冀*]/*、*/[L*京*]* |
| 5 | 并列的简称 | 单独标出 | [L*中*]/[L*俄*]/*两国*/*领导人*/*进行*/*了*/*会晤*[L*港*][L*澳*][L*台*]/地区 |
| 6 | 地名包含人名以及地名包含地名的情况 | 地名中的人名、地名不标 | *[L*李嘉诚广场*]*[L*南京路*] |
| 7 | 地名+地名关键词表达一个完整的概念时 | 相对完整的地名 | *[L*南非共和国*]*[L*宁夏回族自治区*]*[L*香港特别行政区*]* |

### 4.2地名标注细则

#### 4.2.1地名实体示例

/[L*北京*]/ 

/[L*亚洲*]/

/[dat*2008年*]/[L*奥*]*运会*/*，*/[L*中国*]*人*/

/[L*中国*]*人民*/  ----*中国人、中国人民*都是词表词。

/[L*朝鲜*]/*南北*/*对话*/ ----不标注*南、北*。

- 注：词表词"京剧、京白、京腔、京味儿"中的"京"字要标注为：

/[L*京*]*剧*/*、*/[L*京*]*白、*/[L*京*]*腔*/*、*/[L*京*]*味儿*/

/[L*台东火车站*]/ 

/[L*卑南文化公园*]/

/[L*基隆文化中心广场*]/

/[L*高雄港第一港口*]/

/[L*苏澳镇*]/[L*南方澳渔港*]/

/*环*/[L*渤海湾*]/*地区*/*的*/*天然气*/*市场*/

/*来自*/[L*沈阳军区*]/*各*/*集团军*/

/[L*梅狮路后段*]/

/[L*中横公路天祥段*]/

/[L*华禄溪*]/*及*/[L*碧绿隧道*]/ 

/[L*南二高*]/[L*高雄支线*]/

/[L*台廿一线*]/

/[L*美国空军基地*]/

/[L*上海*]/[L*国际航运大厦*]/ 

/[L*上海*]/[L*虹口足球场*]/

/[L*上海博物馆*]/

/[L*上海*]/[L*城市规划展示馆*]/ 

/[L*石家庄*]/[L*富强电力新村*]/

/[L*西安第二长途通讯大楼*]/

/[L*北京市*]/[L*王府井百货大楼*]/

/[L*广深铁路*]/*以及*/[O*深圳发展银行*]/*部分*/*高官*/*也*/*被*/*免职*/ 

/[L*汉江*]/*上*/*的*/[L*圣水大桥*]/

/[L*新亚欧大陆桥*]/ 

---从世界知识知道此处大陆桥的名字叫*新亚欧大陆桥*，是不可分解的。

#### 4.2.2地名指示词（如国、省、市等）视为地名的一部分一起标注

地名指示词（如国、省、市等）视为地名的一部分一起标注。复杂的、具有包含关系的地名要分开标注，但分开标注时不可把一个有完整意义的地名拆散。以下是正确的标注：

/[L*德国联邦*]/*政府*/*总理*/

/[L*基隆市*]/ 

/[L*台东县*]/ 

/[L*南山部落*]/

/[L*美国*]/ [L*马里兰州*]/

/[L*约旦河*]/

/[L*朝鲜半岛*]/ 

/[L*长江三角洲*]/ -----*长江三角洲*是词表词。

/[L*吉林省*]/[L*延边朝鲜族自治州*]/[L*图们市*]/

以下两例均为错误的标注，因为*延边朝鲜族自治州*是具有完整意义的地名：

/[L*吉林省*]/[L*延边*]/[L*朝鲜族自治州*]/[L*图们市*]/

/[L*吉林省延边朝鲜族自治州*]/[L*图们市*]/

- 注：在ER-99的标准测试集中，把*中国西昌卫星发射基地*整体标为地名。我们认为这是错误的，因为在一个地名中不应当包含具有上、下位关系的另一地名。正确的标注是：

/[L*中国*]/[L*西昌卫星发射基地*]/

/[L*美国洛克希德·马丁卫星测控中心*]/*和*/[L*中卫公司测控站*]/

/*从*/[L*法*]/*属*/[L*圭亚那*]/[L*库鲁航天中心*]/*发射*/

- 注：本规范不采用ER-99的标注：*[L*法属圭亚那库鲁航天中心*]*。

/[L*武汉*]/[L*长江大桥*]/

/[L*上海*]/[L*中山公园*]/

- 注：尽管其它城市也有长江大桥和中山公园，但在当地它们已构成完整的地名，所以应单独标注。

/*位于*/[L*朝阳门*]/*外*/*商务*/*区*/*之中*/*，*/

/[L*盛华公寓*]/*坐落*/*于*/[L*西直门*]/*内*/[L*冠英园小区*]/

- 注：内、外都不在标注范围之内，但如果地名中的内、外去掉不能说明是一个完整的地名时，内、外要标注在地名内。如：

/[O*外交部*]/*位于*/[L*北京市*]/[L*朝阳门内南小街*52*号*]/

/[L*西直门外大街*71*号*]/

#### 4.2.3并列的地名应分别标注

对于并列的多个地名应分别标注。对于嵌套在地名中的人名、地名和机构名不再单独标注。例如：

/[L*中*]/[L*意*]/*双方*/ ----*中意*是词表词，作为国名时要切开。

/[L*香港*]/*和*/[L*澳门特别行政区*]/

/*目前*/*已*/*有*/[int*１２个*]/[L*中*]/*、*/[L*东欧*]/*国家*/

/[L*北京*]/[L*上海*]/

/[L*科*]/[L*伊*]/*边境*/

#### 4.2.4跨国家的和国家内部的地名

/[L*西非*]/*国家领导人*/

/*从*/[L*陕*]/[L*甘*]/*革命*/*老区*/*到*/*沿海*/*经济特区*/*，*/

/[L*亚太*]/----亚太是词表词，它是一个地名，而不是两个地名。

/[L*近东*]/*和*/[L*北非*]/

##### 4.2.4.1表示地理方位的名词

一些表示地理方位的名词如*南半球、北半球、江南、江北、西南、西北、华南、华北、华中、东北*等虽然不完全具备确指性，也要作为地名标注为*L*。

/[L*汉水*]/*流域*/*、*/[L*西南*]/*地区*/*东部*/

/[L*江南*]/*大*/*部*/*、*/[L*华南*]/*有*/*小*/*到*/*中雨*/

/*近*/[dur*两天*]/*造成*/[L*东北*]/*、*/[L*华北*]/*地区*/*的*/*降雨*/*天气*/*系统*/

/*迫使*/[L*北半球*]/*的*/*副热带*/*高压带*/*在*/[L*青*][L*藏*]/*地区*/"/*断裂*/"/

- 注：上述地名后面的方位词*南部、北部、东部、西部*不应包括在地名的括号里，因为其所指的区域是更不确定的。

##### 4.2.4.2方位词修饰地名实体时要整体标注为L

/[L*东西九龙*]/ ----这是一个并列的地名。

/*一代*/*又*/*一代*/*海*/*测*/*官兵*/*犁*/*波*/*耕*/*浪*/*于*/[L*南中国海*]/*，*/

/[L*北爱尔兰*]/

/[L*中西伯利亚*]/

- 注：ER-99将此例标为*中*/ [L*西伯利亚*]。我们认为它整体是一个专指性的地名。

/[L*中南美*]/

/[L*东南亚*]/

- 注：ER-99要求把上面两个地名分别标注为[L*中*]/[L*南美*]和[L*东*]/[L*南亚*]。其实*中南美*指*中美*和*南美*两个地名，而*东南亚*是一个地名。这样的细节需要专门的地理知识才能做出判断。所以我们不遵循ER-99的这条规则。

#### 4.2.5地名实体受时间词修饰时，时间词不标

/*前*/[L*苏联*]/

/*前*/[L*南*]/*地区*/ ----*南*指南斯拉夫，时间词*前*不标。

#### 4.2.6 只有经纬度在一起时才能标注为 **L**

只有经纬度在一起时才能标注为L，否则经度或纬度单独标为角度*ang*。如：

/*震*/*中*/*位于*/[L*北纬三十六点二零度，东经九十点二九度*]/

/*并*/*将*/*卫星*/*定点*/*在*/[L*东经*110.5*度赤道*]/*上空*/*。*/ 

/*震*/*中*/*位于*/[ang*北纬*30.5*度*]/*，*/

#### 4.2.7天体的标注

/[L*宇宙*]/ 

/[L*地球*]/ 

/[L*太阳*]/

/[L*太阳系*]/ 

/[L*银河*]/

/[L*银河系*]/

/[L*月亮*]/ 

/[L*海王星*]/

/[L*东方红三号*]/

/[L"*鑫诺１号*"*卫星*]/

- 注：火箭只是卫星的发射工具，故火箭型号不作为星体标注。

/[dat*９６年２月１５日*]/*长征*/[ord*三号乙*]/*火箭*/*发射*/*失利*/，

/*长*/[ord*二*]/*捆*/*火箭*/ ----*全名为*"*长征二号捆绑式运载火箭*"。

### 4.3不作地名标注的示例

/[L*阿*]/[L*以*]/*冲突*/

- 注：ER-99和MET-2认为阿（阿拉伯）不是一个特定国家的简称，本规范不采纳他们的规定。

/*回答*/*了*/[L*中*]*外*/*记者*/*的*/*提问*/ ---*外*不标。

#### 4.3.1地区一般不作为地名的一部分标注

仅当*地区*特指行政单位时，才被视为地名的一部分。一般情况下，*地区*泛指一片地方，不是地名的一部分。若不能确定时，*地区*不作为地名的一部分标注。

/[L*港*][L*澳*][L*台*]/*地区*/ -----*港澳台*是词表词。

/[L*巴尔干*]*地区*/

/[L*临沂*]/*地区*/*现*/*更名*/*为*/[L*临沂市*]/

#### 4.3.2平原、山脉、山区、盆地、沙漠、流域不在标注范围内

*平原、山脉、山区、盆地、沙漠、戈壁、流域、故里、故居、纪念馆、风景区、开发区、经济区*等都不在地名标注范围内。但当某某故居、故里、纪念馆成为一个对外开放的旅游景点时，才作为地名标注。如：

/[L*云*][L*贵*]*高原*/

----*云贵高原*是词表词不可分割，但云、贵要分别标注*L*。

/[L*成都*]/*平原*/

/[L*秦岭山*]/*脉*/

/[L*秦*]/[L*巴*]/*山区*/

/[L*四川*]/*盆地*/ 

/[L*撒哈拉*]*沙漠*/ ----*撒哈拉沙漠*是词表词。

/[L*长江*]/*流域*/

/[L*毕加索故居*]/

/*造型*/*典雅*//*毗邻*/[L*青云岩*]/*风景区*/*及*/[L*北山湾*]/*旅游区*/

/[L*约旦河西岸*]/ ----因为*约旦河西岸*是专指。

/[L*海峡两岸*]/ ----指*台湾海峡两岸*。

/[L*两岸*]/

- 注：词表词*两岸*只有在表示台湾海峡两岸时，才作为地名标注为*L*，当作为*江河、湖泊*的两岸时，*两岸*要切分标注。如：

/[L*长江*]/*的*/*丰姿*/*和*/[int*两*]/*岸*/*的*/*美景*/*尽收眼底*/*。*/

/*祖国*/[L*大陆*]/

- 注：内地虽然指中国大陆，但不作为地名标注，这里遵从了ER-99的规定。特区只有在确指是香港和澳门时才作标注。如：

/*来自*/*内地*/*和*/[L*香港特区*]/

/[L*特区*]/*政府*/*和*/[L*香港*]/*同胞*/*正*/*以*/*喜悦*/*的*/*心情*/ 

/[L*中国*]/[L*厦门*]/*经济特区*/

#### 4.3.3对语言文字前的单音节地名不标，双音节的地名标注为L

*英语*----对*英*不标注。

*汉语*----对*汉*不标注。

*中文*----对*中*不标注。

/*对*/[L*西藏*]/*地区*/*的*/*藏语*/*广播*/

/*主张*/*台语*/*在*/[L*台*]/

/*用*/[L*四川*]/*话*/ ----如果*语、文*前面的地名为双音节时，就要标注。

/[L*荷兰*]/*语*/

#### 4.3.4以族或裔结尾的词组中地名也要标注

MET-2和ER-99规定：以族或裔结尾的词组中的地名不标注。因此*华裔*、*汉族*中的*华*和*汉（指汉族）*都不作为地名标，但*华人、华侨、华商、中医、中草药、中餐馆、亚运会、奥运会*里的*华、中、亚、奥*仍需标注*L*。本规范不采用这一规则。作为民族的名字，单音节的不标，双音节的标*L*。

下面是一些标准实例：

/[L*美*]*籍*[L*华*]人----"美籍华人"是词表词。

/*目的*/*是*/*促进*/[L*塞浦路斯*]/*西*/*族*/*与*/*土*/*族*/*的*/*和解*

/*她*/*和*/*同*/*是*/[L*日*]/*裔*/[int*三*]/*世*/*的*/*男*/*友*/

/*通过*/*在*/[L*中*]*医药*/*宝库*/*里*/*寻找*/*线索*/

/*人们*/*纷纷*/*拥向*/[L*中*]*餐*/*馆*/*，*/*一时间*/*人满为患*/

/[L*吉普赛*]/*人*/----*吉普赛*不是词表词。

/[L*印地安*]/*民族*/*；*/ ----*印地安人*是词表词。

## 第五章 机构名

机构名包括：股票（证券）交易所、国家或国际组织、商业团体（公司、企业、工厂）、电视台、广播电台、报刊杂志、出版社、政党或党派、学校、科研院所、医院、诊所、邮电局、乐队、体育运动队、联盟、议会或代表大会、军队、咖啡厅、酒吧、饭店、旅馆，以及虚构的机构等。

### 5.1机构名标注规则

机构名的后缀应视为机构名的一部分。

| **序号** | **情况** | 标记方法 |例子 |
| --- | --- | --- | --- |
| 1 | 普通名字+机构名 | 整体标出 | *[O*板桥市胜捷公司*]* |
| 2 | 地名+机构名 | 机构名整体标出 | [O*北京市电信局*]*[O*台北县立莺歌高职*]*[O*台北看守所*]*[O*基隆长庚医院*]*[O*东直门敬老院*]机构名的关键词如：幼儿园、各级学校、科学院、部委、实验室、工厂、公司、报刊杂志、出版社、大使馆、领事馆、咖啡店、快餐店、饭店、酒店、旅馆等 |
| 3 | 人名+机构名 | 机构名整体标出 | *[O*李嘉诚基金会*]* |
| 4 | 简称 | 一律整体标注 | *[O*北约*]*[O*上轮集团*]----*指上海轮胎集团*[O*白宫*]/*官员*/表示 |

### 5.2机构名标注细则

#### 5.2.1机构名标注实体示例

/[O*国防部*]/*长*/[P*迟浩田*]/ 

/[O*美国国防部*]/*长*/[P*佩里*]/

/[O*台北县地政局地权课*]/

/[O*地政局*]/

/[O*政风室*]/*接*/*获*/*检举*/*调查*/

/[O*国军北投医院*]/

/[O*三重地政事务所*]/

/[O*台湾银行宜兰分行*]/ 

/[O*省立关山工商*]/

/[O*基隆市光隆家商*]/

/[O*东信国小*]/ 

/[O*安乐国中*]/

/[O*原住民委员会*]/

/[O*连萧全国竞选总部*]/

/[O*北京钓鱼台国宾馆*]/ 

/[L*浙江*]/[O*温州大酒店*]/ 

/[O*松下电工株式会社*]/

/[O*公司*]/*英文*/*名称*/[O *HUNAN* FORE *SCAPE* TECHNOLOGY*CO*．，*LTD*]/

/[O*朝鲜人民武装力量部*]/*副*/*部长*/

/[O*美国海军*]/ 

/[O*欧共体*]/

/[O*中国国家生育委员会*]/ 

/[O*中国奥林匹克队*]/

/[O*披头四*]/ 

/[O*飞虎队*]/

/*敢死队*/ -----泛指不标。

/*但是*/[O*共和党*]/*人*/*说*/

/[O*土耳其议会外交关系委员会*]/ 

/[O*终战*50*周年国会议员联盟*]/

/*记者*/*来到*/[O*中山医科大学第一附属医院住院部*]/

/[O*中共中央政治局*]/*常委*/*、*/[O*中央纪委*]/*书记*/[P*尉健行*]/

- 注：中国共产党的简称中共或共要标注为O。例如：

/[ord*第二次*]/[O*国*]/[O*共*]/*合作*/

- 注：类似的简称党，由于专指性不强，不标，如：

/但/这种/现象/的/产生/，/是/同/党/和/国家/尊师重教/的/方针/背道而驰/的/，

/全国/"/[dat三八]/"/红旗手/、/全国/优秀/共青团员/

- 注："三八红旗手"是词表词。但如果"三八"在文中被双引号断开，就要单独表为dat。另外，词表词共青团员、共产党员、共产党人、中的机构名不确指，所以一律不标。

/[O*中共中央政治局常委会*]/

- 注：常委会可以是机构名，常委则不是。

/*党*/*的*/[O*十四大*]/*以来*/

- 注：中共的*X中全会*不是机构名，除了词表词*三中全会*什么也不标以外，数词*X*应单独标注为*ord*。例如：

/*根据*/*党*/*的*/[ord*十五届*]/[ord*二*]/*中*/*全会*/

/[O*八届全国人大*]/*代表*/[P*陈妙珍*]/

/[O*西藏政协*]/*委员*/*强调*/*，*/*必须*/*旗帜*/*鲜明*/*地*/*反对*/*民族*/*分裂*/

[O*澳门中华总商会*]/*会*/*董*/*兼*/[O*青年委员会*]/*副*/*主任*/

/[O*足协*]/*杯赛*/*冠军*/[O*北京国安队*]/ ----*杯赛*是词表词。

/[O*以国家电视一台*]/ ----指以色列国家电视一台

/[L*汉城*]/[O*路透*]/*电*/

/*前*/[L*苏联*]/[O*切尔诺贝利核电站*]/*泄漏*/*事件*/

/*参加*/*这次*/*比赛*/*的*/*还有*/[O*日本*]/*、*/[O*俄罗斯*]/*、*/[O*美国*]/*、*/[O*德国*]/

和/[O*意大利队*]/*。*/

/*前往*/[O*解放军驻港部队总部*]/*慰问*/*驻军*/

/[O*第四届和平小天使台湾访问团*]/*抵达*/[L*重庆直辖市*]/

/[O*塔里班*]/*部队*/*已经*/*到达*/[P*杜斯塔姆*]/*将军*/*的*/*家乡*/

/*用*/*公款*/*购买*/[O*靖国神社*]/*和*/[O*护国神社*]/*的*/*祭祀*/*品*/

/*纪念币*/*正面*/*是*/*由*/[O*解放军*]/*军徽*/*光，*/*八一南昌起义*/*和*/[O*解放军*]/[O*陆*]/[O*海*]/[O*空*]/*三军*/*战士*/*的*/*图案*/

- 注：词表词*八一南昌起义*是一个事件，不是机构名。*三军*是词表词，所以数字*三*不作为*int*标注。

- 注：股市报导中的企业和公司名不论其前后有没有外文字符，一律作为一个整体

标注成*O*。例如：

/[O*ＳＴ辽物资*]/[dec*１４．１４１*]/[O*宁波中百*]/[dec*２０．３５４*]/

/[O*ＤＲ沪港机*]/[dec*１１．１９４*]/[O*鲁北化工*]/[dec*８．０５１*]/

- 注：商城或百货公司本应标注为L，但作为股市中的企业时应标注为O。

- 注：股票指数在没有明确说明是多少元的情况下一律标注为*int*或*dec*。

- 注：被命名的轮船、飞机、机车应标注为*O*。例如：

/*却*/*购*/*回*/*了*/[int*３张*]/[O*"长月"号轮船*]/*船票*/*，*/

/[O*泰坦尼克号游轮*]/*上*/*的*/*这*/*对*/*情人*/*实在*/*浅*/*得*/*很*/*。*/ 

/[O*美国"哥伦比亚"号航天飞机*]/*上*/*的*/*宇航员*/

#### 5.2.2机构名的后缀是机构名的一部分

机构名的后缀是机构名的一部分，即要准确的标出机构名的最长边界（机构名的全称）。机构名中可以包含人名、地名和机构名，但对于它们不再单独标注。例如：

/[O*苗栗县环保局*]/

/[O*卫生署桃园医院*]/

/[O*兰阳民生医院*]/*前身*/*为*/[O*吴外妇科*]/

/[O*台北爱乐青年管弦乐团*]/

/[O*行政院农委会林业试验所福山分所*]/ 

/[O*宋庆龄基金会*]/

/[O*上海轮胎橡胶（集团）股份有限公司*]/ 

/[O*中国驻日本大使馆*]/

/[O*美国白宫*]/

/*前*/[O*中国新华社香港分社*]/*社长*/[P*许家屯*]/ 

[O*清华大学计算机系人工智能实验室*]/

[O*中保财产保险四川省分公司*]/

#### 5.2.3国家（或国际）立法部门或行政部门标注为机构名

/*当选*/[O*国会*]/*议员*/

/[O*内阁*]/*改组*/*将*/*会*/*在*/[dat*八月底*]/*前*/*完成*/

/*前*/[O*内阁官房*]/*长官*/[P*山静六*]/

/[P*刹瓦什*]/*向*/[O*宪政法庭*]/*提出*/*动议*/

#### 5.2.4地名和机构名紧邻时的情况

地名和机构名的关系一般有以下两种情况：

(1)表示所属关系（如：法国航空航天局，航空航天局隶属法国）。

(2)表示地理位置关系（如：北京邮电大学表示大学位于北京，而不是隶属于北京）。

地名和机构名之间还可能有更复杂的情况，这里不予讨论。

##### 5.2.4.1规则一

如果机构名以一个地名开头，而且删除这个地名后所剩部分不再是一个具有特指性的机构名，那么该地名必须留在机构名中作为该机构名的一部分标注；

/[O*北京大学*]/ 

/[O*深圳中学*]/

/[O*复旦大学专用集成电路与系统实验室*]/

/[O*东南大学*]/[O*深圳宝安设计院*]/

##### 5.2.4.2规则二

如果机构名前面还有一个或多个地名，那么该机构名与前面紧邻的地名应当分开标注。

如：

/[L*中国*]/[O*北京大学*]/

/[L*中国*]/[L*广东*]/[O*深圳中学*]/

/[L*北京*]/[L*昌平*]/[O*十三陵抽水蓄能电站*]/

##### 5.2.4.3规则三

如果一个机构名的开头不是地名，那么当它前面邻接一个或多个地名时，只有其中与该机构名紧邻的那个地名需一起标注。例如：

/[O*上海同济大学*]/

/[L*中国*]/[O*上海同济大学*]/ 

/[O*湖北省武钢三中*]/

##### 5.2.4.4规则四

如果一个机构名本身以两个或两个以上并列的地名开头，则这些地名都要留在该机构名中。如果在它前面再出现其它地名时，一律同该机构名分开标注。但是如果上一级地名不能管辖下一级地名时，要把上一级地名标注在机构名内。

例如：

/[L*洛杉矶*]/[O*亚太法律中心*]/ 

/[L*香港*]/[O*中港贸易协会*]/ 

/[O*广东亚洲大酒店*]/

/[O*澳大利亚维多利亚投资公司上海办事处*]/*》*/，

/[O*澳大利亚维多利亚投资公司*]/*》*/

- 注："广东"与"亚洲、澳大利亚与维多利亚"都不属于上、下级管辖关系，所以要把上一级地名标注在机构名内。

##### 5.2.4.5更复杂的情况

在更复杂的情况下，我们可能无法判定某机构名究竟是以一个还是两个地名开头的。这时可按规则5.2.5和5.2.6来处理。

例如，*洛杉矶台北经济文化办事处*

究竟是A：*[L*洛杉矶*]/[O*台北经济文化办事处*]*

还是B：*[O*洛杉矶台北经济文化办事处*]*

这时，默认的标注方式是B(理由见5.2.8)。

##### 5.2.4.6地名概念比较模糊的情况

如果该地名比较模糊，而标注者又没有足够的知识来判断某机构名的开头是否是一个地名。就标注到一个比较明确的地名，

例如：*印度尼西亚莫巴蒂努山打腊航空公司*中的*莫巴蒂*·*努山打腊*不知道是不是地名。但至少知道一旦拿走了这个字符串，剩下的字符串已不构成专指性的地名。此时，按规则2.5的标注方式应是：

/[L*印度尼西亚*]/[O*莫巴蒂*·*努山打腊航空公司*]/

/[O*河北沙岭子电厂*]/

----*沙岭子*是一个乡镇的地名，河北和内蒙古都有一个沙岭子镇，地名的概念比较模糊，故标注在机构名内。

/*国际*/*著名*/*的*/[O*加拿大*B＋*H国际建筑师事务所*]/

##### 5.2.4.7紧邻的地名和机构名不构成修饰关系的情况

一个地名后紧邻一个机构名，但它们不构成修饰关系，则一律分开标注。

/*促进*/*了*/[L*中国*]/[O*东盟*]/*的*/*合作*/

/*在*/[L*日内瓦*]/[O*联合国*]/&*人*/*权*&/*会议*/*上*/

更典型的例子需借助上下文来判断，如：

/*促进*/*了*/[L*中国*]/[O*微软*]*的合作*/

/[O*中国微软*]/*即将*/*发布*/*新产品*/

- 注：如果标注者不能判断它们是不是修饰关系，则默认为分开标注，如：

/[L*中国*]/[O*微软*]/

/[O*美国众议院*]/

/[L*重庆*]/[O*长江救助打捞公司*]/ /[L*日本*]/[O*东京股市*]/ ----错误标注！

/[L*日本*]/[L*东京*]/*股市*/ ----正确标注。

/[L*美国*]/[L*华盛顿*]/[O*三普证券公司*]/ ----错误标注!

/[L*美国*]/[O*华盛顿三普证券公司*]/ ----正确标注。

/[L*华盛顿*]/[O*美国国务院*]/

/[L*瑞典*]/[O*斯德哥尔摩国际和平研究所*]/

#### 5.2.5会议、晚会、运动会等以会结尾的短语是事件，不作机构名标注

/*泛*/[L*美*]/*运动会*/

/[L*中国*]/[ord*第一届*]/*人工智能*/*大会*/ 

/[ord*第四届*]/[L*中*]/[L*法*]/*经济*/*研讨会*/

/[ord*第三届*]/[L*海峡两岸*]/*水利*/*科技*/*交流*/*研讨会*/

----以上几例为事件，不是机构名。

/[O*中国人工智能协会*]/

/[O*中国人工智能联合会*]/ ----为机构名。

当会议指议会(congress)或代表大会(chamber of deputies)时，应视为机构名。但是要注意:虽然议会或代表大会是机构名，但是议会或代表大会中的某一次会议是一个事件，不是机构名。为了更明确地区分各种情况，我们用以下例子说明：

/*通报*/*了*/[O*八届政协*]/[ord*五次*]/*会议*/*的*/*各*/*项*/*安排*/

/[O*全国政协*]/[ord*八届*]/[ord*五次*]/*会议*/*将*/*于*/

/*听取*/*和*/*审议*/[O*全国政协八届五次会议常务委员会*]/*报告*/

/*审议*/[ord*八届*]/[ord*五次*]/*会议*/*提案*/*审查*/*情况*/*的*/*报告*/

- 注：*八届五次会议*、*五次会议*是一个事件，不应标注为机构名。但是这次会议的组委会、委员会应视为机构名。例如：

/[O*八届全国人大*]/[ord*五次*]/*会议*/

/[O*政协九届一次会议*]/ --错误标注！

/[O*中国共产党第十五次全国代表大会*]/

/[O*九届人大*]/[ord*一次*]/*会议*/

/[O*中国全国人大*]/ 

/[O*中共十五大*]/ 

/*各级*/*人大*/*常委会*/ --不是专指，故不标。

/[O*中国科协*]/[ord*第五次*]/*全国代表大会*/

/[L*湖南省*]/[ord*六届*]/[ord*二次*]/*全*/*委*/*会议*/

/*向*/*同级*/*人民代表大会*/*或*/*人民代表大会常务委员会*/*提请*/*审议*/

- 注：*全国人民代表大会*和确指的省、市人民代表大会及其常委会、常务委员会需作为机构名标注。泛指的人大、中央银行、人民银行、&*农*/*发*/*行*&不作为机构名标注。

/[O*临澧县人大*]/*抓*/*村*/*级*/*财务监督*/*一瞥*/*（*/*监督*/*广角*/）

/*由于*/*各级*/*人大*/*代表*/*的*/*有效*/*监督*/*，*/[dat*去年*]/*以来*/*该*/*县*/*各*/*村*/*村*/*务*/*情况*/*出现*/*好转*/*，*/

- 注：在地名*国会大厦*中，*国会*不可作为机构名标注，否则就出现嵌套了。

/[L*国会大厦*]/

- 注："联合国大会"及其简称"联大"都是词表词，但不可整体标为O。如：

/[O*联合国*]*大会*/*于*/[dat*１９９２年*]/*批准*/*了*/*这*/*一*/*条约*/*。*

/[P*沈国放*]/[dat*２７日*]/*在*/[O*联*]*大*/*全体*/*会议*/*上*/*表示*/，

- 注：*会*也可能出现在一般的机构名中，如：

/[O*红十字协会*]/

#### 5.2.6用我们、我等代词修饰的机构名，只对机构名进行标注

/*我国*/[O*共产党*]/ 

/*我们*/[O*清华大学*]/

- 注：根据上下文是确指的某公司、单位名称的简称要标注为机构，否则不标注！但如果在公司、集团等词前面有本、我、该等字样时，此处的公司、集团不进行标注。其他特殊情况依据上下文进行标注。如：

/*凡*/*《*/[O*克罗伏特缓冲器股份有限公司*]/*股份*/*》*/*记名*/*的*/*持有*/*人*/*均*/*为*/*本*/*公司*/*股东*/*。*/

/*我*/*公司*/*出资*/*总额*/[mon*50万元*]/

/[O*港资陕西华懋实业公司*]/*总经理*/[P*商铭渔*]/*，*/*受*/[O*公司董事会*]/*委托*/*来到*/[O*咸阳市西北地勘局二一五医院*]/*看望*/[O*公司*]/*保安*/*员*/[P*韩玉刚*]/*，*/

#### 5.2.7大使馆和领事馆的标注

当大使馆(或领事馆或其它外交使团)所代表的国家和所在地区相连时，整体标为机构名。如：

/*后来*/*调*/*任*/[O*美国驻洪都拉斯大使馆*]/

当大使馆(或领事馆或其它外交使团)所代表的国家或所在地没有出现在上下文中，或者在描述范围内不连续，那么存在两种情况：

（1）大使馆所代表的国家和大使馆（领事馆）相连，此地名和大使馆一起标记为机构名。如：

/*前往*/[L*香港*]/*的*/[O*洪都拉斯领事馆*]/

（2）大使馆所在地和大使馆（领事馆）相连，此地名应单独标记，整体不作为机构名。如：

/[L*美国*]/*在*/*通过*/*驻*/[L*金沙萨*]/*大使馆*/*和*/*其它*/*正常*/*管道*/

- 注：虽然*驻金沙萨大使馆*是一个连续的短语，但它的实际意思是*美国（或*X*国）驻金沙萨大使馆*，而不是什么*金沙萨（的）大使馆*。因此在这里*大使馆*不视为机构名。

#### 5.2.8生产厂家要标注为机构名，产品则不标

这里定义的产品范围较广，不仅包括生产厂家生产出来的产品（如自行车等），还包括计算出来的产品（如：股票指数）、媒体产品（如：电视节目）

/[O*道琼*]/*工业*/*平均*/*指数*/

----因为股票指数可以视为产品，那么*道琼*就可以视为生产厂家。

/[O*纳斯达克*]/*指数*/ ---原因同前。

/[O*太原刚玉*]/[dec*１０．５８１*]/ 

/[O*咸阳偏转*]/[dec*１６．１１２*]/ 

/[O*深华发Ａ*]/[dec*１５．６６３*]/ 

/[O*渝开发*A]/

#### 5.2.9报纸、广播电台、电视台和杂志的名字要标为机构名

新闻媒体（如：报纸、广播电台、电视台和杂志等）的名字要标为*O*，但报刊、电视栏目的名字不标。例如：

/[O*美国之音*]/*记者*/*表示*/

/[O*人民日报*]/*海外*/*版*/[ord*第三版*]/

/*《*/[O*泰晤士报*]/*》*/*援引*/*一个*/*国际*/*专家*/*委员会*/

/[O*中央电视台*]/*《*/*焦点*/*访谈*/*》*/*、*/*《*/*东方*/*时空*/*》*/*主持人*/

/[O*武汉电视台*]/*《*/*科技*/*之*/*光*/*》*/*栏目*/*的*/*《*/*科学家*/*，*/*您好*/*》*/*专栏*/ 

/[O*美国《科学》杂志*]/

/[O*美国探索电视网*]/

/*创办*/*《*/[O*深圳房地产快讯*]/*》*/ 

/*办*/*好*/*《*/[O*中外房地产导报*]/*》*/

#### 5.2.10特殊情况

***民族不作为机构名***

***泛指的*部队不作为机构名**

***政府不作为机构名***

***学术或商务会议(conference,meeting)不作为机构名***

***交易会不作为机构名***

***运动会不作为机构名***

***联赛不作为机构名***

#### 5.2.11特殊情况示例

/[L*中国*]/[L*天津*]/*出口*/*商品*/*交易会*/

/[L*中国*]/[O*天津出口商品交易会*]/ ----错误标注！

/[L*中国*]/*政府*/ ----*不把政府*标为机构名。

/[L*非洲*]/*维持*/*和平*/*部队*/ ----*不把部队*标为机构名。

/[L*中国*]/*公安*/*部门*/ ----*不把部门*标为机构名。

/[O*中国公安部门*]/ ----错误的标注！

- 注：标注并列的机构名（*O*）时，连接词和标点符号不进入标注范围。例如：

/[O*上海*]/*、*/[O*北京人类基因组研究中心*]/

/[P*贺国中*]/*分别*/*任*/[O*一*]/*、*/[O*四*]/*、*/[O*七团*]/*党代表*/

- 注：上述情况和标注并列的序数（*ord*）不同，连接词和标点符号是否进入标注范围取决于序数词所修饰的词语。例如：

/*获得*/*个人*/[ord*一、二、三等*]/*奖*/

/[ord*一*]/*、*/[ord*二*]/*、*/[ord*三*]/*产业*/

/*书店*/[ord*三、四层*]/

- 注：*中央*不作为机构名，但*党中央*标为机构名。

/*在*/*中央*/*的*/*领导*/*下*/

/*以*/[P*胡锦涛*]/*同志*/*为*/*核心*/*的*/[O*党中央*]/*周围*/

#### 5.2.12地名和机构名容易混淆的情况

/[L*人民大会堂*]  ----地名。

/[O*五角大楼*]/*发言人*/*说*/*，*

/[O*白宫*]  ----机构名。

/[O*克里姆林宫*]/*表示*  ----机构名。

/*在*/[L*总统府*]/*分别*/*约见*/*了*/*多*/*位*/[O*国民党*]/*中*/*常委*/*检察官*/

- 注：*总统府*标注为L而不是*O*。这是因为有的国家有多处总统府，所以不能把它们视为国家或政府的唯一代表。

- 注：下面的例子中出现的类似单位名称的，因不是确指，而且是出现在各种条令、合同中，适合任何一个省、市、县的单位机构名称，所以不能作为一个机构名称标注为*O*。如：

/*本*/*合同*/*正本*/[int*三份*]/*，*/*出租*/*人*/*、*/*承租*/*人*/*、*/*市*/*公证处*/*各*/*执*/*一*/*份*/*。*/*副本*/*若干*/*份*/*，*/*报*/*市*/*经济*/*委员会*/*、*/*市*/*经济体制*/*改革*/*委员会*/*、*/*市*/*财政*/*局*/*、*/*劳动局*/*、*/*税务局*/*、*/*审计*/*局*/*、*/*工商*/*行政管理*/*局*/*、*/[O*中国人民银行*]/*市*/*分行*/*、*/[O*中国工商银行*]/*、*/*市*/*分行*/*等*/*有关*/*部门*/*备案*/ */*本*/*合同*/*在*/*履行*/*中*/*如*/*发生*/*争议*/*，*/*双方*/*应*/*协商*/*解决*/*；/*协商*/*不*/*成*/*时*/  /*任何*/*一方*/*均*/*可*/*向*/*工商*/*行政管理*/*局*/*合同*/*仲裁*/*委员会*/*申请*/*调解*/*或*/*仲裁*/

## 第六章 数字串标注总则

数字串（**Factoid**）包括时间表达式（**TIMEX**） 、数字表达式（ **NUMEX** ）、度量表达式（**MEASUREX**）和地址表达式（**ADDREX**）等**4**大类，***27***个小类，详见表**1-1**。标注数字串的一条重要原则就是：它的标记不得插入到词表词的内部（见**1.5.2.4**）。

### 6.1时间表达式

时间表达式（*TIMEX*）包括日期（*dat*）、时间（*tim*）和时段（*dur*）三小类。所有小于一天的时间都被定义为时间（*tim*），如秒，分，小时。一天或者大于一天的时间单位则属于日期（*dat*），如*天，日，星期，礼拜，月，季度，年，五年，十年，世纪*等。时段（*dur*）通常也使用日期和时间中的单位，如*月、年、时、分*等。对此标注者要注意区分。

将日期、时间同时段区分开来有时是困难的，下面分别给出它们的定义。

#### 6.1.1日期（dat）和时间（tim）的定义

日期和时间在一维的时间坐标轴上有相对确定的位置。小于一天的时间都被定义为时间。一天或者大于一天的时间则属于日期。

/[tim*8*点30*分*]/

/[dat*今天*]/[tim*晚上*]/ ----*晚上*是词表词。

/[dat*昨天*]/[tim*夜里*]/ ----*昨天*和*夜里*都是词表词。

/[tim*昨夜*]/ ----*昨夜、昨晚*都是词表词，只能整体标*tim*。

/[dat*昨*]/[tim*晚*]/ ----错误的标注！

/[dat*春节*]/---在每一年中，是比较固定一天或几天。

/[dat*1999年*]/---以*年*为单位，与别的年份相区别。

/*在*/"/[dat*六五*]/"/*中*/---以*五年*为单位，与别的*五年*相区别

- 注：严格地说，每一个*dat*或*tim*都占据了一个时间段，因此这里出现的*期间*和*中*，不能作为标注时段的理由。

/"/[dat*九五*]/"/计划

/[dat"*九五*"*初*]/

/*仅*/*"*[dat*八五*]/*"*/*期间*/*就*/*达*/[mon一百一十五亿元]/。

/[dat*下半年*]/---以*半年*为单位，与*上半年*相区别。

/[dat*二十世纪*]/---以一百年为单位，与别的*世纪*相区别。

/*为*/*庆祝*/[O*北京大学*]/*建*/*校*/[dat*１００周年*]/*，*/

/[dat*民国八十六年*]/

/[dat*民国六十年代*]/

/[dat*八十八年下半年*]/*及*/[dat*八十九年*]/*中央*/*统筹*/*分配*/*款*/*，*/ 

/[dat*公元二千年*]/

/[dat*今年九月*]/

/*"*/[O*迈特兴华*]/*"*/*杯*/[ord*首届*]/*全国*/*象棋*/*大师*/*赛*/*于*/[dat*今日*]/*收*/*秤*

/[dat*１９９７年下半年*]/*，*/ 

/*可*/*于*/[dat*农历年*]/*前*/*迁居*/*。*/ 

/[tim*第七十三分钟*]/

/[tim*中午*12*点*]/

/[tim*格林威治时间*5*时*59*分*]/----含有地名。

/[dat*第二天*]/[tim*一大早*]/*，*----*一大早*是词表词。

/*在*/[dat*今年暑期*]/*大学生*/*送*/*科技*/*下乡*/*活动*/*中*/，

/*大约*/[tim*七点*]/*到达*/*----大约*不标。

/[tim*晚上大约七点*]/*到达*/

- 注：*大约*被两个*tim*包围，分割不开，所以整体标上。这条标注规则遵照了ER-99和MET-2的标准。

- 注：事件戊戌变法、辛亥革命、甲午战争、五四运动等都是词表词，其中的日期不标注。但当戊戌、辛亥、五四单独出现时，应作为日期来标注。例如：

/*与*/*稍*/*后*/*的*/*辛亥革命*/*，*/*都*/*有*/*相通*/*的*/*地方*/，

/*在*/[L*香港*]/*回归*/[dat*周年*]/*前夕*/*和*/*"*/*七七事变*/*"*/*纪念日*/*，*/[dat*戊戌*]/*思潮*/*与*/*前此*/*的*/*洋务运动*/，

#### 6.1.2时段（dur）的定义

时段既可以长于一天，也可以短于一天。它不同于日期和时间，在一维的时间坐标轴上没有确定的位置。例如：

/[dur*三年*]/ 

/[dur*半年*]/

/[dur*四分之一个世纪*]/

/[dur*廿四个月*]/

/*时间*/*长*/*达*/[dur*六分钟*]/ 

/[dur*两个星期*]/

/[dur*一个月*]/*后*/

/*曾*/*在*/[dur*５、６年*]/*前*/*撰文*/*陈述*/

/*早产*/[dur*十二周*]/*左右*/

/*大水*/[dur*十天*]/*后*/*才*/*退*/*尽*/ /[dur*一至两年*]/

/[dur*一小时卅分钟*]/ /*这*/[dur*几天*]/

/[dur*卅天*]/*会期*/*只*/*开*/*了*/[dur*九天*]/

/*虽*/*经*/[dur*一整天*]/*磋商*/*，*----*一整天*不是词表词，但要标为*dur*。

与*/*洪水*/*奋战*/[dur*一天一夜*]/*，*----*一天一夜*也不是词表词。

时间表达式的标注细则详见第七章。

### 6.2数字表达式

数字表达式（*NUMEX*）包括百分数（*per*）、钱款（*mon*）、频度（*fre*）、整数（*int*）、分数（*fra*）、小数（*dec*）、序数（*ord*）、比率（*rat*）等8小类。

#### 6.2.1百分数（per）

/[per*百分之二十五*]/

/[per*百分之一点七*]/ ---虽然是小数，但要标作per。

/[per*六点五百分点*]/ /[per*五成*]/*以上*/ /[per*六折*]/

/[fra*百万分之八*]/  ----注意标的是*fra*而不是*per*。

/*大约／*[per5%]/ ----约数*大约*不进入标注。

#### 6.2.2钱款（mon）

/[mon*四亿元台币*]/

/[mon*43.6亿美元*]/ 

/[mon*卅万元*]/

/[mon*四万五千块钱*]/ 

/[mon*四万五千元人民币*]/

/*只*/*增加*/*了*/[mon*几元钱*]/*的*/*成本*/

/*决不*/*乱*/*花*/*国家*/*的*/[mon*一分钱*]/。

- 注：同一笔钱的不同货币形式需分开标注。货币中的地名不标。

[mon*26万英镑*]/ (/[mon*43.6亿美元*]/)/

- 注：*约*是一个不确切的概念，故不标注。但*上*、*数*、*好*要和数字串捆绑在一起标注。但*近*作为特例，不与数词捆绑!!

/*约*/[mon*十万元*]/

/*大概*/*需要*/*花费*/[mon*上千万美元*]/*的*/*投资*/*和*/[dur*3*年*]/*左右*/*时间*/*，*/ 

/*多*/*收入*/[mon*好几十元*]/

#### 6.2.3频度（fre）

/[fre*数度*]/ /[fre*两次*]/ /[fre*26次*]/ /[fre*十多次*]/ /[fre*多次*]/

- 注：动量词次除了一次不标注以外，其余的全部标注为*fre*。

/[fre*一次次*]/

/[fre*再次*]/ /[fre*无数次*]/ /[fre*数次*]/

#### 6.2.4整数（int）

*int*标注的是数词和量词组合成数量词组。

/[int*卅七件*]/ /[int*一百卅项*]/ /[int*三种*]/

/[int*九个*]/*课室*/ /[int*几家*]/

/*后*/[int*几名*]/ /[int*十*]/*多*/*人*/ /[int*四条*]/*断层*/ /[int*五十户*]/ /[int*百余名*]/ /[int*上万*]/*人潮*/

/*"*/[int*双*]/[int*百*]/*"*/*方针*/，

- 注："双百方针"是词表词，由于文中"双百"用引号括起，而且它们是两个数字，所以要分别按数字串标注。类似情况还有词表词"五四运动"，这是个事件不标。但是如果文中日期"五四"被引号括起，就要单独标为：/"/[dat五四]/"/运动/。又如"六一儿童节、六一国际儿童节、六一节"都是词表词。由于"六一"和"儿童节"是同一个日期，即使在文中"六一"被引号括起，也可以整体标为dat，如：/[dat"六一"儿童节]/。

- 注：人次应标注为*mea*而不是*int*，例如：

/*近*/[dur*３年*]/*中*/*，*/*该*/*市*/*采取*/*多*/*形式*/*的*/*农技*/*培训*/*近*/[mea*万人次*]/，

- 注："*数词*+*强*"不一定表示序数，因此只单独标注数词为*int*。例如：

*/*在*/*这次*/*从*/[int*十六*]/*强*/*到*/*冠*/*、*/*亚军*/*的*/*一次性*/*竞猜*/*中*/*，*/*

/[O*宝钢*]/*为*/*跻身*/*世界*/[int*５００*]/*强*/*而*/*采取*/*的*/*重要*/*步骤*/*。*/

#### 6.2.5分数（fra）

/[fra*数倍*]/ 

/[fra*一半*]/ 

/[fra*千百倍*]/ 

/[fra*3/4]/

/[fra*四分之三*]/

/[fra*百万分之三百六十四*]/ *----*注意标记是*fra*而不是*per*。

/[fra*半个*]/ /[fra*4倍半*]/ *----*倍数是分数的一种表示，应标*fra*。

/[fra*4倍半*]/

/[fra*4.5倍*]/ ----*虽然* *4.5*是个小数，但不标*dec*。

/*有效*/*载*/*力*/*提高*/[fra*２至３倍*]/

注："过半数"是词表词，因此不作为分数fra标注。例如：*/*都*/*难以*/*获得*/*过半数*/*的*/[int*２０７张*]/*选票*/*，*/*

#### 6.2.6小数（dec）

/[dec*3.14]/

/[dec*三点一四*]/

/*看*/*了*/*那么*/*长*/*时间*/*的*/*电视*/*，*/*视力*/*依旧*/[dec*1*．*5*]/

/*我*/*有着*/*足以*/*令*/*我*/*自豪*/*的*/[dec*1*．*2*]/*视力*/

----视力的多少是一个量级，没有单位，故按数量标注整数或小数。/*并*/*以*/[dec*6139.69点*]/*收盘*/

/*以*/ [dec*33.8*]/*收盘*/ /*比重*/*：*/[dec*1.02*]/

#### 6.2.7序数（ord）

/[ord*第一任*]/

/[ord*第一期*]/ 

/[ord*十六楼*]/

/[ord*第三次*]/*世界大战*/

/[ord*首*]/*日*/*销售*/*欠佳*/ 

/[ord*第二*]/*故乡*/

/[ord*三等*]/*奖*/

/[ord*前*6*名*]/

/*地震烈度*/*不*/*超过*/[ord*8度*]/ 

/*这*/[ord*第二条*]/*尤为*/*重要*/ 

/*位居*/*金牌*/*榜*/[ord*第二名*]/

/[O*北京市*]/[ord*首家*]/*就业*/*与*/*创业*/*组合*/*市场*/ 

/[ord*1174号*]/*文件*/

/[ord*6*路*]/*汽车*/ /[ord*六年级*]/*学生*/

/[dat*今年*]/*读*/[ord*大三*]/

/*发展*/*第一产业*/* ----第一产业*是词表词。

/*发展*/[ord*第一*]*产业*/ ----错误的标注。

/*阵风*/[ord*五级*]

/[ord*一、二、三等*]/*奖*/。

/*他*/*亲手*/*接*/*治*/[L*墨西哥*]/[ord*首例*]/*艾滋病*/*患*/*儿*/

#### 6.2.8比率（rat）

/ [rat*一比廿五*]/

/*以*/[rat*０∶６*]/*失利*/

/*上*/*一*/*届*/*世界杯*/*赛*/*就*/*以*/[rat*１∶０*]/*胜过*/*。*/

/*最终*/*以*/[rat*三比三*]/*握手言和*/*。*/ /*用*/*原液*/*与*/*水*/*稀释*/[rat*1*∶*20*倍*]/*。*/

数字表达式的标注细则详见第八章。

### 6.3度量表达式

度量表达式（*MEASUREX*）包括年龄（*age*）、温度（*tem*）、角度（*ang*）、长度（*len*）、面积（*are*）、容积（*cap*）、重量（*wei*）、速度（*spe*）、加速度（acc）和其它（*mea*）等10小类。

#### 6.3.1年龄（age）

/[age*卅五岁*]/

/[age*廿一岁*]/ 

/[age*六十五岁*]/ 

/[age*34岁*]/ 

/[age*六十寿辰*]/

/[age*花甲*]/*老人*/ ----*花甲*是词表词。

/*如同*/[age*年过半百*]/*的*/*老*/*妇*/*。*/ ----*年过半百*是词表词。

/[P*李元*]/*、*/[P*卞德培*]/[int*两位*]/*先生*/*都*/*已*/[age*年逾古稀*]/。

#### 6.3.2温度（tem）

/*寒流*/*耍*/*酷*/*平地*/[tem*6℃*]/ 

/*才*/*会*/*微*/*升*/[tem*6.1℃*]/

/*但*/*平地*/*温度*/*还*/*会*/*下*/*探*/[tem*5℃*]/*左右*/

/*积温*/*高*/*（*/[tem*2800度*]/*）*/----注意！

/[tem*零下*5*到*6*摄氏度*]/

- 注：数字范围的标注方式详见7.1.1。

/*大约*/[tem*5~7℃*]/

/*低温*/*反而*/*只*/*有*/[tem*10℃*]/~/[tem*12℃*]/ 

/[tem*摄氏19*－*24度*]/

/[tem*摄氏*19*度*]/ -/[tem*24度*]/

#### 6.3.3角度（ang）

/*钝角*/*就*/*是*/*大于*/ [ang*90度*]/*的*/*角*/

/*并*/*将*/*卫星*/*定点*/*在*/[ang*东经*110.5*度*]/[L*赤道*]/*上空*/*。*/

/*震*/*中*/*位于*/[ang*北纬*30.5*度*]/*，*/ ----详见4.2.4.2

#### 6.3.4长度（len）

/*开掘*/*到*/ [len*一米六七*]/*深度*/*时*/ 

/*高*/ [len*五米*]/*宽*/ [len*一百米*]/ /[len*109×78厘米*]/

/[len*1纳米*]/=/[len*十的负九次方米*]/

/*应用*/*于*/*紧*/*固*/*件*/*直径*/*为*/[len*1*／*4″*]/*（*/[len6m]/*）*/

/*最高*/*速度*/*每*/*秒*/  [len*360米*]/

/*发生*/*每*/*秒*/*速度*/*达*/[len*四十二米*]/*的*/*大风*/*。*/

/[L*三峡*]/*截流*/*落差*/*在*/[len*0.7-0.8米*]/*之间*/*，*/

#### 6.3.5面积（are）

/[are*廿七公顷*]/*土地*/

/*占*/*地*/[are*六百多公顷*]/

/*兴建*/[are*五千坪*]/*大*/*的*/*厂房*/ /[are*七百余坪*]/

/*每*/*套*/*住宅*/*面积*/[are*140*－*160m2*]/*，*/ 

/[are*997万平方公里*]/

/*农田*/ [are*20万亩*]/

#### 6.3.6容积（cap）

/*运输量*/*为*/ [cap*34个立方*]/

/[cap*一两箩*]/*谷子*/

/*选定*/*的*/*设计*/*流量*/*是*/*每*/*秒*/[cap*1.4*万至*1.9万立方米*]/*。*/ 

/*工程*/*已*/*完成*/*土方*/[cap*２３００多万方*]/*，*/

/*全国*/*消费*/*了*/[cap*２５万升*]/*啤酒*/*。*/

#### 6.3.7重量（wei）

/[wei*九百至一千吨*]/

/[wei*零点三公克*]/

/[wei*三千二百英吨*]/

/*重*/*约*/[wei*五、六公斤*]/*的*/*鲤鱼*/

/[wei*十台斤*]/

/[wei*三点五公吨*]/

/*产量*/*达到*/ [wei*数千万吨*]/ /[wei*几万吨*]/

/[wei*二十万吨*]/*级*/*以上*/

- 注：ER-99把上例标为：[wei*二十万吨级*]/以上。

#### 6.3.8速度（spe）

/*最高*/*速度*/ [spe*360米每秒*]/

/*打印*/*速度*/*：*/[spe12cps]/

----"*cps*"表示"characters per second（每秒字符数）"。

#### 6.3.9加速度（acc）

/*抗震*/*能力*/*：*/*地面*/*水平*/*加速度*/*≤*/[acc*0.4m*／*s2*]/ /*地面*/*垂直*/*加速度*/*≤*/[acc*0.2m*／*s2*]/

#### 6.3.10其它度量表达式（mea）

除了上面提到的度量单位元之外，物理、化学及其它度量单位统一标注为*mea*。/*额定*/*电压*/*至*/[mea*660V*]/

/[mea*5.5瓦特*]/

/*参观*/*人数*/*达*/[mea*620万人次*]/ /*工资*/[mea*3500元*/*人*/*月*]/

/[mea*25元*/*公斤*]/

/*风*/*压*/*不*/*超过*/[mea*700Pa*]/*（*/*相当于*/*风速*/[spe*34m*／*s*]/*）*/*。*

/*迁移*/*到*/[mea*千兆比特*]/*的*/*能力*/*能够*/*降低*/*拥有*/*总*/*成本*/*的*/*管理*/*方案*/

/*这些*/*快速*/*以太*/*网*/*和*/[mea*千兆位*]/*以太*/*网*/*服务器*/

### 6.4地址表达式

地址表达式（*ADDREX*）包括电子邮箱（*ema*）、电话（*pho*）、传真（*fax*）、电报挂号（*tel*）、邮政编码（*pos*）和网址（*www*）等6种。

#### 6.4.1电子邮箱（ema）

/[ema *exp@email.com.cn*]/

/[ema*cnhuang@msrchina.research.microsoft.com*]/

#### 6.4.2电话（pho）

在标注电话号码时，要把国际区号、国内区号、本地区号等作为一个整体标注。如果有分机号码也要一并标注。当有多个分机号码时，要分别标注。如：

*预约*/*订*/*位*/*电话*/[pho*九五一八六二八*]/ 

/*洽*/*询*/*电话*/[pho*二四九三一零二零*]/ 

/*订*/*席*/*专线*/[pho（*8610*）-78906617]/

/*查询*/*电话*/*是*/(/[pho*零三八六二一一零零转二五二*]/)/

/*查询*/*电话*/[pho*三六九九七二一转二三三一*]/*或*/[pho*二三三二*]/

/[pho*120*]/

/[pho*119*]/

#### 6.4.3传真（fax）

/*全国*/*客户*/*服务*/*传真*/*：*/[fax*010-58722727*]/

/*传真*/*号码*/:/[fax*86-10-66665555*]/

/*公司*/*传真*/*：*/[fax*86-10-66665555*]/

#### 6.4.4电报挂号（tel）

/[O*搜狐公司*]/*电报挂号*/*是*/*：*/[tel(8610)*62726666*]/ 

/*电报挂号*/*：*/[tel*86-10-66665555*]/

/*联系*/*电话*/*：*/[tel*86-10-66665555*]/

#### 6.4.5邮政编码（pos）

/[O*清华大学*]/*的*/*邮政编码*/*是*/*：*/ [pos*100080*]/

/[L*安徽*]/[L*阜阳*]*/*地区*/*的*/*邮政编码*/*是*/*：/[pos*233600*]/

#### 6.4.6网址（www）

/*活动*/*报名*/*网址*/*：*/[www http://www.acer.net/event/apply]/

/[O*蕃薯藤*]/*购物*/*网*/*（*/[www http://shopping.yam.com]/*）*/

## 第七章 时间表达式标注细则

时间表达式（*TIMEX*）包括日期（*dat*）、时间（*tim*）和时段（*dur*）三小类。

### 7.1日期（dat）

/[dat*明治三十九年*]/*（*/[dat*公元一九零六年*]/*）*/ 

/[dat*大正十四年*]/*（*/[dat*公元一九二五年*]/*）*/

/[dat*昭和二年*]/*（*/[dat*公元一九二七年*]/*）*/

/[dat*清*]/[dat*道光十四年*]/ 

/[dat*清*]/[dat*咸丰十一年*]/

/[dat*民国六十八年*]/*拆除*/*后*/*迁到*/[L*芦洲*]/*，*/[dat*八十一年*]/*间*/*又*/*扩建*/ 

/[dat*一九九九*]/

/[dat*一九九九年十二月三十号*]/ 

/[dat*公元*1990*年*4*月*22*日*]/ 

/[dat*旧石器时代*]/

/[dat*八十年代*]/

/[dat*下半年*]/

/[dat*1989财年*]/ ----注意!

/[dat*1989*财年第三季度*]/

/[dat*1990*上半财年*]/

/[dat*1991*财政年度*]/

/[dat*秋季*]/*报告*/ 

/[dat*第四季度*]/

/[dat*十五世纪*]/

/*努力*/*成为*/*一*/*名*/*高*/*素质*/*的*/[dat*跨世纪*]/*人才*/*。*/ 

/*值*/*此*/[dat*世纪之交*]/*的*/*时候*/*，*

/*走*/*向*/[dat*新世纪*]/*的*/[L*中国*]/*律师*/*业*/ 

/[dat*新旧世纪交替*]/*之际*/

/*黑色*/[dat*星期一*]----注意！

/[L*北京*]/*在*/[dat*23号*]/*发表*/*了*/*报告*/

- 注：数字串*23号*若不表示日期，则不标。

/[dat*五月上旬*]/  ----*上、中、下旬*要标注。

/*科技*/*之*/[dat*夏*]/  ----注意！

/[dat*夏*]/[dat*秋*]/*之间*/

/[dur*一年*]/*中*/*四季*/*分明*/  ----*四季*是词表词不标注。

/[L*南极*]/*的*/[dat*夏季*]/ 

/[L*中国*]/[dat*汉代*]/

/[dat*春节*]/  ----日期确定的节日要标注。

/[dat*肉孜节*]/

/[dat*开斋节*]/

/[dat*中秋*]/*时节*/

----注意*时节*不标。

/[L*美国*]/*的*/[dat*独立日*]/----美国独立日为每年7月4日。

/[dat*27年*]/*是*/*一个*/*多*/*事*/*的*/*年份*/

- 注：*27年*可能表示时段，标注者须根据上下文注意区分。

/*现在*/*是*/[dat*26号*]/*，*/[dat*星期三*]/

----同一个时间的不同表达，要分开标注。

/*现在*/*是*/[dat*二月九号*]/*，*/[dat*农历大年初三*]/

/*大约*/[dat*五月四日*]/*----大约，大致，大概*等词不标。

/[dat*第二个十年*]/ /[dat*第二年３月*]/ 

/[dat*当年*9*月*]/ 

/[dat*今春*]/  ----*今春*不是词表词。

#### 7.1.1日期起讫表达式的标注

当日期表达式中有至、到和连结符－时，处在至、到和连结符－前后的日期表达式分别叫做前式和后式。如果前式和后式都是完整的日期表达式，则它们应分别进行*dat*标注；否则前、后式要整体标注为*dat*。

这条规则同样适合于其它各类数字串的标注，如：*tim*，*dur*，*int*，*tem*，*wei*，*mon*等。其一般表达式为：

/X+量词/到/X+量词/

/X+量词/至/X+量词/

/X+量词/－/X+量词/

/X+至+X+量词/

/X+到+X+量词/

/X+－+X+量词/

/X+、+X+量词/

例如：

/[dat*三月三日*]/*至*/[dat*三月卅一日*]/ 

/[dat*一月十八日*]/*到*/[dat*廿一日*]/ 

/[dat*三月三至廿一日*]/

/[dat*二月十八日*]/-/[dat*廿一日*]/ 

/*于*/[dat*今明两年*]/*陆续*/*推出*/*。*/ 

/[dat*民国五十五、五十六年*]/ 

/[dat*今明两天*]/

/[dat*今*]/*、*/[dat*明*]/[dur*两日*]/

/[dat*1980年*]*到*[dat*1990*年*1月*]/

- 注：含有比喻意义的今天、昨天、明天、今日、昨日、明日全不标注。

*/*"*/*一失足成千古恨*/*，*/*同学*/*们*/*，*/*看到*/*今天*/*的*/*我*/*，*/*你们*/*是否*/*感悟*/*到*/*了*/*什么*/*？*/*"*/

/*尽管*/*炮火*/*已*/*消失*/*在*/*昨天*/*那*/*段*/*苦难*/*，*/

/[O*"四方"集团*]/*的*/*明天*/*将*/*会*/*更加*/*灿烂*/*美好*/*。*/

- 注：当年、同年、当月等词语后有具体的日期时，要整体标注dat，如果当年、当月、同年等词语单独出现，而其前后有确指的日期时也要标注为dat，否则不作标注!当日、当天等词后有具体的时间时标注为dat，否则不作标注!如：

/[dat*当年７月*]/*在*/[L*莫斯科*]/*举行*/ 

/*然后*/*于*/[dat*同年８月*]/*奉调*/*回国*/*。*/

/[P*克林顿*]/*在*/[dat*当月１３日*]/*表示*/*，*/

/*那*/*是*/[dat*当天*]/[tim*中午１时*]/*的*/*汇率*/

/*发言人*/*于*/[dat*当日*]/[tim*午夜*]/*发表*/*声明*/

#### 7.1.2前、头、下+时段（dur）应整体标注为dat

/[dat*头两个礼拜*]/ 

/[dat*前３天*]/

/[dat*今年头四个月*]/

/*比*/[dat*上一年*]/*增长*/[per*１０．４％*]/*。*/

/*集中*/*研究*/*解决*/[dat*下半年*]/*纠风*/*工作*/*如何*/*突出*/*重点*/*，*/

/[dur*两周*]/*前*/

/[dat*1993年之初*]/ ----注意!

/[dat*公元之初*]/

#### 7.1.3当乾隆、康熙、道光等表示年代时标注为dat

当*乾隆、康熙、道光*等表示年代时标注为*dat*，而当*乾隆、康熙、道光*等表示皇帝本人的名字时标为P。如：

/*最近*/*发现*/*一*/*张*/*在*/*农家*/*珍藏*/*的*/[dat*清代*]/[P*康熙*]/*、*/[P*雍正*]/*、*/[P*乾隆*]/*、*/[P*嘉庆*]/*、*/[P*道光*]/[int*五*]/*皇帝*/*诰封*/*圣旨*/[int*九道*]/*，*/ /*收藏*/*了*/*自*/[dat*清代*]/[dat*乾隆*]/*年间*/*至今*/*各个*/*历史*/*时期*/*的*/*鼻烟壶*/*艺术*/*珍品*/*，*/

#### 7.1.4朝代名的默认值为dat

当朝代名被上下文确认为国家名时标注*L*，否则默认为*dat*。如：

/*如果*/[P*刘伯温*]/*不是*/*一直*/*压抑*/*着*/*对*/[dat*元*]/*王朝*/*的*/*不满*/*，*/ 

/[dat*楚*]/*霸王*/[P*项羽*]/*带领*/[int*两万*]/*兵*/*将*/*，*/

/*只*/*带*/[dur*三天*]/*粮食*/*，*/*渡过*/[L*漳河*]/*去*/*与*/*强大*/*的*/[dat*秦*]/*兵*/*作战*/*。*/*结果*/*，*/[dat*楚*]/*军*/*大败*/[dat*秦*]/*军*/*。*/

/[dat*吴*]/*王*/[P*夫差*]/*战胜*/*了*/[dat*越*]/*王*/[P*勾践*]/，

/[dat*战国*]/*时*/[L*赵国*]/*良*/*相*/[P*蔺相如*]/*曾*/*为*/[L*赵国*]/*立*/*下*/*汗马功劳*/；

*[P*唐睢*]/*出使*/[L*秦国*]/*，*/*

《*/[L*水浒*]/*全传*/*》*/*描述*/*的*/*是*/[dat*北宋末年*]/*震撼*/[dat*宋*]/*室*/*江山*/*的*/[P*宋江*]*起义*/*。*/*

/*从*/*侧面*/*表现*/*了*/[dat*清*]/*政府*/*的*/*腐败*/*无能*/*，*/*激起*/*了*/*深*/*埋*/*在*/*人们*/*心底*

/*对*/*侵略者*/*的*/*敌视*/*和*/*对*/[dat*清*]/*政府*/*的*/*愤怒*/*，*/

/*但是*/*，*/*战争*/*最终*/*因*/[dat*清*]/*政府*/*的*/*妥协*/*、*/*投降*/*而*/*告*/*失败*/*。*/ /*无奈*/*夜郎自大*/*、*/*腐败*/*不堪*/*的*/*大*/[L*清国*]/*武器*/*太*/*落后*/*，*/

#### 7.1.5"过去、今后、未来+时段（dur）"中的修饰成分不进入标注范围

/*过去*/[dur*３年*]/*中*/*，*

/*将*/*在*/*未来*/[dur*几年*]/*内*/*出现*/

/*未来*/[dur*两天*]/*沿江*/*地区*/*仍*/*有*/*中*/*到*/*大雨*/*，*/

/[dat*今年七八月*]/*间*/

#### 7.1.6词表词近年来、近些年、近几年来、近几年、几年来等均不标注

按规定，词表词*近年来、近几年来、近几年、几年来、多年来、近些年*等内部的*dat*、*tim*、*dur*都是不标的。但对非词表词则要分开标注。例如：

/[L*瑞士*]/*多年来*/*是*/[ord*第一次*]/*。*

/近几年/，/[L中]/[L菲]/关系/

/*近*/[dur*五年*]/*来*/

/*时至今日*/*仍*/*在*/*缓刑*/*期间*/*。*/*-----时至今日*是词表词。

### 7.2时间

/[tim*凌晨零时*]/

/[tim*清晨六时卅五分*]/*到*/[tim*四十分*]/ 

/[tim*凌晨二至四点*]/

/[tim*中午十二时*]/-/[tim*晚上九时*]/ 

/[tim*上午十一时*]/*至*/[tim*下午二时*]/ 

/[tim*第七十三分钟*]/

/[tim*格林威治时间*5*时*59*分*]/ ----含有地名。

/[tim*下午当地时间*5*时*59*分*]/

/[tim*九点整*]/*到达*/[L*北京站*]/

/[dat*九月十三日*]/*大约*/[tim*七点*]/*到达*/[L*北京*]/

- 注：这里*大约*不标。因为它虽被一个*dat*和一个*tim*包围，但是仍可以分割开。

### 7.3时段

/[dur*两个星期*]/ 

/[dur*一个月*]/*后*/

/*曾*/*在*/[dur*５、６年*]/*前*/*撰文*/*陈述*/

/*早产*/[dur*十二周*]/*左右*/

/*大水*/[dur*十天*]/*后*/*才*/*退*/*尽*/

/[dur*一至两年*]/ /[dur*一小时卅分钟*]/ 

/*这*/[dur*几天*]/

/[dur*卅天*]/*会期*/*只*/*开*/*了*/[dur*九天*]/ 

/[dur*10个月*]/

/*虽*/*经*/[dur*一整天*]/*磋商*/ ----*一整天*不是词表词，但要标为*dur*。

/*与*/*洪水*/*奋战*/[dur*一天一夜*]/*，*/ ----*一天一夜*也不是词表词。

*/*历经*/[dur*一二十年*]/*创建*/*了*/*庞大*/*的*/*船队*/*，*/*

/*让*/*我们*/*全家*/*人*/*感动*/*了*/[dur*好几天*]/

/*在*/*水门*/*丑闻*/ [dur*四分之一世纪*]/*时*/*发表*/*的*/*评论。*/

- 注：按照前面的原则：*水门*/*丑闻*/ [dur*四分之一世纪*]/*时*在时间坐标轴上有比较固定的位置，因此应当标为*dat*。但这种与事件（水门丑闻）相关的时间表达，在ER-99和MET-2中都是不标注的。这样，只有*四分之一世纪*需要标注为*dur*。

/[dur*十多年*]/

/[dur*几年*]/*以来*/

/*在*/[dur*半年*]/*时间*/*内*/----注意：*上半年*是*dat*。

/*在*/*总结*/[dur*14年*]/*改革开放*/*经验*/*的*/*基础*/*上*/

- 注：*14年*、*30年*也可能表示dat。标注者要注意区分。*/*我们*/*在*/*美国*/*奔波*/*了*/[dur*30年*]/

/[dur*27年*]/*的*/*军旅*/*生涯*/

/*整整*/[dur*十五年*]/  ----*整整*不标。

/*大约／*[dur*十年*]*／的／时间*/ ----*大约*不标。

/[dur*十年*]/*来*/

/[dur*十几年*]/*的*/*时间*/ ----注意！

/[dur*十几年*]/*来*/ /[dur*十来年*]/ /[dur*数年*]/

/[dur*多年*]/ ----ER99不标。

#### 7.3.1一年都标为dur

/*新*/*的*/[dur*一年*]/*即将*/*开始*/

/*硬*/*是*/*在*/*地下室*/*干*/*了*/ [dur*一年*]/*的*/*公司*/

/[dur*一年*]/*创*/*产值*/*效益*/…/…/

/*聘金*/*为*/[dur*一年*]/ [mon*900万美元*]/*的*/*价码*/

- 注：*/*这*/*一年*/*、/*那*/*一年*/*中的一年不是确指不作标注。

*/*这*/*一年*/*，*/*企业*/*增收节支*/*达*/[mon*１１０万元*]/

/*在*/[O*北大*]/*就读*/*的*/*那*/*一年*/*，*/

- 注：整天、整日、整夜一律标注为*dur*，如：

/[dur*整天*]/*都*/*很*/*安静*/*，*/

/*还*/*东奔西走*/[dur*整日*]/*忙*/*个*/*不停*/*，*/

/*让*/*人*/[dur*整夜*]/*不得*/*入睡*/

- 注：当年、月、日、周等词修饰后面的工资、交易（销售）额、创汇等词语时，要作为时段（*dur*）来标注。如：

/[dur*月*]/*收入*/*就*/*在*/[mon*千元*]/*以上*/

/[dur*年*]/*交易额*/*近*/[mon*１０００亿元*]/*。*

/*这*/*一*/*工程*/[dur*日*]/*处理*/*污水*/[cap*２万立方米*]/*。*

#### 7.3.2一天的标注有以下三种情况，需区别对待：

##### 7.3.2.1"前一天"，不论其前面有没有定语修饰统统标注为dat（参见7.4.1）：

/[dat*前一天*]/*还*/*静止*/*的*/*电梯*/[dat*今天*]/*动*/*起来*/*了*/*，*/

/[L*香港*]/[O*恒生*]/*指数*/*比*/[dat*前一天*]/*下跌*/[int*４１２点*]/*，*/

/*这次*/[L*中*]/[L*韩*]/*足球*/*对抗赛*/*是*/*在*/[O*韩国队*]/*准备*/*赴*/[L*法*]/*出征*/

*世界杯*/*的*/[dat*前一天*]/*举行*/*的*/*，*/

##### 7.3.2.2"一天"的意思是指时间段（24小时），标注为dur：

/*每人*/*每月*/*接待*/*来访*/[dur*一天*]/*，*

/[P*汤*]/[P*尤*]/*杯*/[dur*一天*]/*不*/*拿*/*回来*/*，*/

/*仅*/[dat*５月３１日*]/[dur*一天*]/*，*/[L*莫斯科市*]/*税*/*警*/*就*/*查出*/[int*１６００个*]/*违法*/*经营者*/*。*/

/*青年人*/*辛苦*/*忙碌*/*了*/[dur*一天*]/*来*/*此*/*坐*/*坐*/*，*/ /*在*/[L*墨西哥*]/*最后*/[dur*一天*]/*的*/*访问*/*中*/*，*/

/*每*/*枚*/*多*/*赚*/[mon*７分钱*]/*，*/[dur*一天*]/*下来*/*能*/*多*/*收入*/[mon*好几十元*]/*。*

##### 7.3.2.3"一天"的意思相当于"有一天"，由于不是确指的日期所以什么也不标：

/*但愿*/*有一天*/*我们*/*轻松*/*地*/*说*/*：*/*消费*/*着*/*是*/*美丽*/*的*/*。*/

/[dat*１９９７年*]/*的*/*一天*/*，*/[P*吴佩民*]/*在*/*办公室*/*热情*/*接待*/*了*/*一个*/*素不相识*/*的*/*中年*/*妇女*/*。*/

/*一天*/[tim*下午*]/*，*/*记者*/*到*/*那*/*店*/*里*/*专门*/*拜访*/*了*/[P*佛朗科*]/*师傅*/*。*/

/*一天*/*，*/[P*列宁*]/*收到*/*一*/*封*/*前线*/*发*/*来*/*的*/*要求*/*支援*/*武器*/*和*/*服装*/*的*/*电报*/*。*/

/*一天*/*上*/*晚*/*自习*/*回来*/*，*/*有*/*一*/*条*/*狗*/*总*/*跟着*/*她*/*，*/

/*一天*/[tim*深夜*]/*，*/*一*/*人*/*酒后*/*拦截*/*过往*/*的*/*外地*/*车辆*/*，*/ /*一天*/*，*/*我*/*走过*/*他*/*的*/*门前*/*，*/

/*一天*/[tim*晚上*]/*，*/*新*/*上任*/*的*/[L*河北省*]/[O*栾城县委*]/*书记*/[dat*六月八日*]/*，*/

- 注："这/一天、那/一天"中的"一天"也非确指，所以也不标。

/*记住*/*这*/*一天*/*，*/*也是*/*表达*/*我*/*对*/[L*香港*]/*回归祖国*/*的*/*预祝*/*。*/ 

/[P*王龙雨*]/*从*/*上任*/*的*/*那*/*一天*/*起*/*，*/

### 7.4有关时间表达式的规则

#### 7.4.1前(后)+日期|时间要整体标注

/[dat*今年前五个月*]/ 

/[dat*前三天*]/

- 注：以下的标注是正确的：

/*在*/*上半时*/*结束*/*前*/[dur*1分钟*]/  ----*上半时*是词表词。

/*比赛*/*前*/[dur*十分钟*]/

/*在*/*上*/*半场*/[tim*第２７分钟*]/*时*/

#### 7.4.2反例——不应该标注的例子

*刚才、最近、开始军备谈判以来、一会儿*等表示不确定时间的词语，不标。如果节日没有确定的时间，也不标。如：

/[L*印度*]/*国际*/*电影节*/

/[L*中国*]/*旅游年*/

#### 7.4.3特例

若两个短语属于不同的子类*dat*和*tim*，就需分开标注。

/[dat*2*月*12日*]/[tim*上午*8*点*]/

/[dat*星期一*]/[tim*8点*]/

- 注1：时间中的地名，如北京时间下午*5*点，在ER-99中不标注，而在MET-2中要标注。本规范按MET-2标注（参照前面的例子）。如果*dat*和*tim*分不开，就整体标注。

/[tim*北京时间*1997*年*2*月*9*号*19*点*28*分*]/

- 注2：*去年、昨天、今早*等词在MET-2中要标，在ER-99中不标。本规范只参照MET-2:

/[dat*去年上半年*]/ 

/[dat*今年夏天*]/

/[dat*今年三月一日*]/ 

/[dat*去年春夏之交*]/

/[dat*昨天*]/[tim*夜里*]/ ---*夜里*是词表词。

/[dat*今天*]/[tim*晚上*]/ ---*晚上*是词表词。

/[dat*今*]/[tim*早六点*]/ ---*今早*不是词表词。

/[tim*早上六点*]/ ---*早上*是词表词。

/[dat*５月份*]/*产品*/*出口*/*和*/*转口*/*总值*/*比*/[dat*去年同月*]/*下降*/[per*３．２％*]/*，*

/[dat*同一天*][tim*晚上*]/

/[dat*当日*]/[tim*下午*]/

- 注3：当日是词表词。如果在上下文中能确定*当日、当天*或*同一天*的具体日期时，就标注；否则不标。

/*每日*/[tim*上午１１时*]/*至*/[tim*深夜３时*]/ ----*深夜*是词表词。

/[tim*昨夜*]/ ----*昨夜*是词表词。

/*每*/[dat*周四，二，一*]/

- 注：MET-2和ER-99对*早上六点*的标注是相同的。但ER-99认为*早上六点*与*今早六点*不同。原因可以从英语的表达来理解：前者是"6：00 am"，后者是"6：00 this morning"。"this morning"在ER-99中被视为"相对时间"，不标注。但在MET-2中，"相对时间"是要标的。本规范遵循MET-2。

/[dat*11月２４至２７日*]/

/[dat*3*月*15日*]/*至*/[dat*17日*]/ 

/[dat*1949年*]/-/[dat*1972年*]/

/[L*美国*]/*南北战争*/*（*/[dat*１８６１—１８６５年*]/*）*/*中*/

/*软件*/*最*/*长*/*的*/*寿命*/*为*/[dur*两到三年*]/*，*/

---请注意这里日期范围的标注方式。

*迄今*----词表词不标，MET-2标*今*。

*今后*----词表词不标，MET-2标*今*。

*晨练*----词表词中的*晨*不标。

*晚宴*----词表词中的*晚*不标。

*春联*----词表词中的*春*不标。

*他们*/*的*/*今天*/*，*/*仿佛*/*就是*/*我们*/*的*/*明天*/*。*----泛指不标。

*参加*/*半决赛*----*半决赛*是词表词，*半*不标。

*双边*/*会谈*----*双边*是词表词，因此*双*不标。

#### 7.4.4每年和年不标注

本规则也适用于*月，天，小时*等其它时间单位。例如：

*/*年产值*/*…*/*…*/*

/*每年*/*创*/*产值*/*效益*/*…*/*…*/ 

/*每年*/*收入*/*…*/*…*/

## 第八章 数字表达式标注细则

数字表达式（*NUMEX*）包括百分数（*per*）、钱款（*mon*）、频度（*fre*）、整数（*int*）、分数（*fra*）、小数（*dec*）、序数（*ord*）、比率（*rat*）等8小类。以下是数字表达式的一些标注规则。

### 8.1如果整数、分数、小数、序数后面有量词，数量短语要整体标注

例如：

/[int*几千万盆*]/ 

/[int*几家*]/*工厂*/

/*一*/*家*/ [int*5*]/*人*/

/*一*/*家*/ [int*5口*]/*人*/

/*铁人*/[int*三项*]/*比赛*/*是*/*多*/*项目*/*的*/*综合*/*运动*/*，*

/*计算机*/*配置*/*：*/586/*以上*/*，*/[int*8兆*]/*内存*/*以上*/ 

/*打印*/*分辨率*/*：*/[mea*180dpi*]/

注：*dpi*表示每英寸的点数，所以作为*mea*标注。

/*评为*/*"*/[int*十*]/*星*/*级*/*乡镇*/*"*/*、*/*"*/[int*十*]/*星*/*级*/*支部*/*"*

### 8.2单纯的数字、词表词（包括俗语）中的数字都不作标注

如：

/*自然数*/5/*和*/6/*都是*/*整数*/

/*大家*/*听*/*口令*/*，*/*齐步走*/*，*/*一*/*二*/*一*/*，一*/*二*/*一*/*，*/*一*/*二*/*三*/*四*/*，/*

/*但是*/*卷子*/*上*/*的*/"/6/"/*还是*/*颠*/*巍巍*/*地*/*变成*/*了*/"/8/"/*。*/

/[L*瑞士*]/*、*/[L*西班牙*]/*、*/[L*比利时*]/*、*/[L*丹麦*]/[int*四*]/*国*/

/*并*/*促进*/*了*/[L*中*][L*美*]/*两国*/*的*/*交流*/*与*/*合作*/ ----*两国*是词表词。

/*并*/*促进*/*了*/[L*中*][L*美*]/[int*两*]*国*/*的*/*交流*/*与*/*合作*/*，* ----错误！

/*垄断*/*了*/[L*神奈川*]/*、*/[L*青森*]/*等*/[int*５*]/*县*/*的*/*交通*/*信号*/*维修*/*业务*/*。*

/[L*两岸*]/*经济*/*合作*/*和*/*直接*/*三通*/ ----*三通*是词表词。

/[L*两岸*]/*经济*/*合作*/*和*/*直接*/[int*三*]*通*/ ----错误!

/*到*/[L*云*]/[L*贵*]/[L*川*]/*的*/*大三线*/*地区*/*，----大三线*是词表词。

/*到*/[L*云*]/[L*贵*]/[L*川*]/*的*/*大*[int*三*]*线*/*地区*/*，----错误！

/*十年寒窗*/ ----*词表词中的十年*不标。

/*千载难逢*/ ----*词表词中的千载*不标。

/*十*/*年*/*九*/*旱*/*----非词表词。虚指的十年*不标。

/*眼*/*观*/*六*/*路*/*，耳*/*听*/*八*/*方*/ ----非词表词。虚指的六、八不标。

/*利*/*在*/*千秋*/*的*/*大事*/ ----*虚指的*"*千秋*"不标。

/*十*/*年*/*如*/*一*/*日*/ ----虚指的*十年*和*一日*，不标。

/*万里*[L*长城*]/ ----虚指的*万里*，不标。

/*三皇五帝*/----*三皇五帝*是词表词。

/*乌七八糟*/*的*/*东西*/*几乎*/*扫荡*/*殆尽*/*----乌七八糟*是词表词。

/*三大球*/*在*/*走*/*向*/*市场*/*时*/----*三大球*是词表词。

/*第二次世界大战*/*的*/*反法西斯*/*斗争*/----*第二次世界大战*是词表词。

/*三五成群*/*地*/*散落*/*着*/*警察*/*，*----*三五成群*是词表词。

- 注：*一会儿，一起，唯一，付之一炬，一流，千方百计，一分为二，一切，二娃*等词表词中的数字一律不标。

/*本职*/*创*/"/*一流*/"/*活动*/ /[int*亿万*]/*人民*/

/[int*百万*]/*民众*/

- 注：按照ER-99，*亿万、百万*不是一个抽象的数字，因此是要标注的。

### 8.3约、近是一个不确切概念，故不同后面的数字串一起标注

*上*、*数*、*几*、*好*则要和数字串捆绑在一起标注，而*约、近*作为特例，不与数词捆绑。

/*大约*/[int*12亿*]/*人口*/

/*约*/[int*四五千*]/*人*/*在*/[L*金边奥林匹克运动中心*]/*举行*/*集会*/*，*/ 

/*约*/[mon*十万元*]/

/*近*/[mon*千万元*]/

/*大概*/*需要*/*花费*/[mon*上千万美元*]/*的*/*投资*/*和*/[dur*3年*]/*左右*/*时间*/*，*/

/[O*省电力公司*]/*还*/*投资*/[mon*好几百万元*]/*，*/

/*多于*/[mon$90,000]/ /[mon*几百万新元*]/

/*统计*/*了*/[int*上百种*]/*数字*/*，*/

/*每年*/*都*/*要*/*花费*/*大量*/*外汇*/*引进*/[int*上百套*]/*系统*/

/*每年*/*搞*/[int*一两个*]/*工程*/*，*/

/*邀请*/*全国*/*近*/[int*百名*]/*书法*/*名家*/*，*/

/*近*/[int*千名*]/*员工*/

- 注：余、多本不应标注，但当它们位于量词前分割不开，所以整体加以标注。

/[mon*二十七万余元*]/ 

/[mon*五百多万元*]/

### 8.4钱款式中的地名

钱款表达式中的地名不论是单音节还是多音节的，ER-99和MET-2都不标，否则就形成嵌套。

如果货币字符串在文本中单独出现，字符串中没有数字修饰，那么双音节的地名要标注为*L*，单音节的地名不标注。例如非词表词*泰铢*中的*泰*不标。注意词表词*日元*、*美元*中的单音节的地名也不标。

/[mon*2000新元*]/

/[mon*2000新加坡元*]/

/*泰*/*铢*/*汇率*/*稳定*/*在*/[mon*３８铢*]/—/[mon*３９铢*]/*兑*/[mon*美元*]/*水平*/

/*纷纷*/*抛*/*出*/*日元*/*购*/*进*/[L*德国*]/*马克*/*，*/ 

/[L*菲律宾*]/*比索*/*对*/*美元*/*汇率*/*也*/*下跌*/*。*/

### 8.5钱款标注中的特例

MET-2规定：如果没有表示钱款的单位，则不标。ER-99则不然。本规范采用ER-99的规定。

/*这*/*辆*/*汽车*/*值*/[mon*20万*]/

/*卷标*/*上*/*的*/*价格*/*是*/ [mon*50*]/

/[O*纳斯达克*]/*跌*/ [int*140*]/

### 8.6频率的特例

/[fre*四年一度*]/ ----*四年一度*并非词表词，但整体标注为*fre*。

/[fre*一年一度*]/

----*一年一度*是词表词，整体标注为*fre*。

*/*主要*/*在*/*交流*/[fre*50Hz*]/*，/*额定*/*电压*/*至*/[mea*660V*]/

*---*交流电的频率是*50Hz*（赫兹）*,*即每秒变化*50*周。所以理应标成*fre*而不是*mea*。

/*频率*/*高*/*（*/[fre*30*－*60KHz*]/）

/*卫星*/*每年*/*发射*/[fre*６至７次*]/。

- 注：又一次、再一次全部标注为fre，但/*一次*/*又*/*一次*/例外，不作标注。

如：

/*此间*/*舆论*/[fre*又一次*]/*注意*/*到*/[L*亚*]/[L*非*]/*足球*/*的*/*差距*/ 

/*精湛*/*演技*/*，*/[fre*再一次*]/*赢得*/*了*/*首都*/*观众*/*的*/*由衷*/*赞赏*/

### 8.7名词方没有与之搭配的量词，因此可以和前面的数词直接结合

在我方、校方中的名词方没有与之搭配的量词，因此可以和前面的数词直接结合，如：

/[int*三方*]/*已*/*就*/[O*劳斯莱斯*]/*汽车*/*的*/*前景*/*达成*/*协定*/*，*/

### 8.8一相当于英语的冠词a，一般不标

一相当于英语的冠词a，一般不标，但一倍是例外,要标fra。例如：

/*一个*/*条件*/

/*一*/*座*/*城市*/

/*最大*/*的*/*企业*/*之一*/

/*荣立*/*一等功*/ ----*一等功*是词表词，不可标注。

/*荣立*/[ord*一等*]*功*/----错误的标注！

/*获*/*县*/*政府*/*新技术*/*推广*/[ord*一、二等*]/*奖*/*。*/

/*我*/*的*/*收入*/*是*/*她*/*的*/[fra*一倍*]/ ----*一倍*是要标的。

### 8.9一（1）+量词不标注*int*

#### 8.9.1一+量词是词表词的情况

词表词一个、一种、一类、一批、一次、一套、一阵等作为数量短语不予切分，也不标注*int*。其中有些量词重迭形式也是词表词，如一个个、一天天，应保持其整词形式，而其它非词表词的数量短语和量词重迭形式都是要切开的。

/*一个*/*人*/

/*一个个*/*观众*/ 

/*一种*/*算法*/

/*一套*/*特种*/*邮票*/

/*一次*/*讨论*/

/*一*/*匹*/*黄骠马*/ 

/*一*/*栋*/*栋*/*楼房*/

/*一天天*/*暖和*/*起来*/

#### 8.9.2词表词一起、一块、一道、一面用作数量短语时应切开

词表词*一起、一块、一道、一面*有副词和其它词性的用法，但当它们用作数量短语时一律切开，而且不标注*int*。

/*一*/*块*/*石头*/

/*一*/*起*/*交通*/*事故*/

/*一*/*面*/*镜子*/

### 8.10一（1）+物理单位元需按度量表达式标注

一（1）+物理单位元（如米、公斤、摄氏度等）需按度量表达式（见6.3）标注。如：

/[wei*一公斤*]/*大米*/

/[mea*一度*]/*电*/

### 8.11分数词素半

#### 8.11.1词表词中的词素半不可标注为fra（分数)

词表中有许多词含有词素*半*，如*半价、半票、半饱、半身、半世、半辈子、上半时、下半场、半边*等，但不可把上述词表词中的词素*半*标注为*fra*（分数）。

/*上*/[fra*半*]/*场*/*比赛*/[L*中国*]*队*/*未进*/*一*/*球*/

/*下半场*/----词表词，是正确标注。

/*下*[fra*半*]*场*/----在词表词中插标*fra*是错误的。

/*目前*/*还*/*空闲*/*着*/[fra*一大半*]/*的*/*营业*/*面积*/*。*/

/*他们*/*之中*/*肯定*/*有*/[fra*一多半*]/*人*/*没有*/*球*/*票*/ 

/*有*/[fra*大半个*]/*篮球*/*场*/*那么*/*大*/

- 注：当半作为一个独立的词时要标注，标注的原则是：半+量词或名词时标注，半+动词或形容词时不作标注，如：

/*下半场*/*后*/[fra*半*]/*段*/

/*地处*/*偏僻*/[fra*半*]/*山区*/

/*部分*/*企业*/*停产*/*或*/*半*/*停产*/

/*而*/*处于*/*半*/*死亡*/*或*/*休眠*/*状态*/*，*/

/*干旱*/*半*/*干旱*/*地区*/*径流*/*造林*/*技术*/*、*/

#### 8.11.2以下的词表词不作为分数标注，而作为其它不同的数字串标注

/[dur*半年*]/ 

/[dur*半天*]/

/[tim|dur*半夜*]/ 

/[int|age*半百*]/

#### 8.11.3例外

半个西瓜中的半个，与四半中的半概念不一样，前一个半是指二分之一，后一个半是量词，所以标注也不同。

/[fra*半个*]/*西瓜*/

/[int*一个*]/*西瓜*/*分为*/[int*四半*]/

### 8.12序数词素首

#### 8.12.1词表词中的词素首不可标注为ord（序数)

词表中有许多词含有词素*首*，如*首创、首倡、首选、首发、首航、首飞、首演、首映、首战、首展、首席代表、首席科学家、首席执行官、首富、榜首、魁首、居首*等。但不可把词表词中的词素*首*单独作为*ord*（序数）来标注。

/*首席执行官*/----正确标注。

/[*ord首席*]*执行官*/----在词表词中插标*ord*是错误的。

#### 8.12.2具有首+量词结构的词表词或非词表词，应整体作为ord标注

具有"首+量词"结构的词表词有：*[ord*首届*]*，*[ord*首次*]*，*[ord*首批*]*，*[ord*首位*]*，[ord*首例*]等。

具有首+量词结构的非词表词，如：

/[L*北京市*]/[ord*首家*]/*就业*/*与*/*创业*/*组合*/*市场*/ 

/[P*满文军*]/*则*/*以*/*自己*/*的*/[ord*首张*]/*个人*/*专辑*/ 

/[dat*首日*]/*销售*/*欠佳*/

----这里首日不能作序数词来标注，应标注为日期*dat*。(详见7.1)。

- 注：头版、头条是词表词。它们和头一回统统标注为*ord*。如：

/*在*/[dat*４月１１日*]/*的*/*《*/[O*人民日报*]/*》*/[ord*头版*]/[ord*头条*]/*社论*/*位置*/*发表*/*出来*/*，*

/*由于*/*是*/[ord*头一回*]/*，*/*总*/*怕*/*有*/*个*/*闪失*/*，*

- 注："头"的上述标注不可类推到其它词组中，例如，

*上*/*半场*/*表现*/*不好*/*，*/*头*/[dur*１０分钟*]/*甚至*/*有些*/*拖泥带水*/*。*

*----*注：这里半场是词表词，但不标注为*fra*。

### 8.13序数词+量词结构，应整体作为ord标注

/[ord*第一期*]/ 

/[ord*第二*]/*故乡*/

/[ord*三等*]/*奖*/

/[dat*第一天*]/ *---*相对日期，标*dat,*而不是** [ord*第一*]/*天*。

/[dat*第二年*]/ *---*相对日期，标*dat,*而不是** [ord*第二*]/*年*。

/[O*波音*]/747 */*  ----*产品序号不标*。

/*地震烈度*/*不*/*超过*/[ord*8度*]//

/*这*/[ord*第二条*]/*尤为*/*重要*/*，*/ 

/*位居*/*金牌*/*榜*/[ord*第二名*]/*。*/

/*作为*/*大豆*/*行动*/*计划*/*的*/[ord*第二步*]/

/[ord*1174号*]/*文件*/

/[ord*6路*]/*汽车*/ /[ord*六年级*]/*学生*/

/[dat*今年*]/*读*/[ord*大三*]/

/*发展*/*第一产业*/  ----*第一产业*是词表词。

/*发展*/[ord*第一*]*产业*/ ----错误的标注。

/*阵风*/[ord*五级*]/

/*通过*/*大学*/*英语*/[ord*六级*]/

- 注：联赛中的A/组、B/组等不作为序数字串标注。如：

/*在*/[L*里昂*]/*进行*/*的*/*世界杯*/*Ｇ*/*组*/*比赛*/*中*/

- 注："甲级、甲/Ａ、乙/级、乙/Ａ"等不作为序数ord标注。如：

*/*当即*/*停止*/*该*/*场*/*比赛*/*主*/*裁判员*/*执法*/*全国*/*足球*/*甲*/*Ａ*/*联赛*/*；

/*获得*/[ord*前两名*]/*的*/*球队*/*晋级*/*甲*/*Ａ*/*行列*/*。*/

/[dat*１９９８年*]/*全国*/[O*男篮*]/*甲*/*Ｂ*/*联赛*/

/*判处*/*以*/[P*东条英机*]/*为首*/*的*/[int*７名*]/*甲级*/*战犯*/*死刑*/*。*/

### 8.14仅当形容词前表示比赛名次时才和后面的序数结构一起标注

仅当形容词前表示比赛名次，如前*6*名、前四（指前四名）时，才和后面的序数结构一起标注为*ord*。其余的情况如前两次、前三组、前三场、前两项等，前都不得进入被标注的数字表达式。

/*获得*/[ord*前十名*]/*的*/*是*/*：*/*在*/*前*/[int*两轮*]/*小组*/*赛*/*中*/

/*列*/*前*/[int*两位*]/*的*/*是*/[O*澳大利亚队*]/*和*/[O*日本队*]/*。*

### 8.15文本中表示标号的数字不标

规范、条例中的条款标号，包括一、二、三、Ⅰ、Ⅱ、Ⅲ、1，2，3、第一条、第二条、第三条等，一律不予标注。只有当这些条款被正文引用时，才作为序号ord被标注。例如：

/*第二*/*，*/*制定*/*必要*/*的*/*行规*/*、*/*行约*/*，*/*共同*/*规范*/*，*/*共同*/*遵守*/*，*/

/*一*/*无*/*资金*/*，*/*二*/*无*/*场地*/

/*一*/*靠*/*政策*/*调动*/*农民*/*的*/*积极性；*/ /*二*/*靠*/*科技；*/

/*一*/*是*/*继续*/*加强*/*农业；*/

/*二*/*是*/*采取*/*措施*/*稳定*/*物价*/*，*/*抑制*/*通货膨胀；*/ 

/*１*/*．*/*自卑*/*的*/*羞耻*/*感*/*。*/

/*２*/*．*/*依赖*/*的*/*恐惧*/*感*/*。*/

/*（１）*/*加强*/*爱国主义*/*的*/*宣传*/*教育。*/

/*（２）*/*加强*/*正确*/*的*/*理想*/*、*/*信念*/*、*/*人生观*/*、*/*价值观*/*的*/*宣传*/*教育*/*。*

"*第*+*数词*+*条*"视为词表词，但作为文中陈述的标号时不标注*ord*。仅当其在文中被引用时才作为*ord*标注。例如：

/*第一条*/*、*/*消费者*/*永远*/*是*/*对*/*的*/*；*

/*第二条*/*、*/*如果*/*消费者*/*真*/*的*/*错*/*了*/*，*/*请*/*参照*/[ord*第一条*]/*。*/

- 注：当上述数字表示等级序号时，则要标注为*ord*。例如：

*污秽*/*等级*/*：*/[ord*Ⅰ*]/*、*/[ord*Ⅱ*]/*、*/[ord*Ⅲ*]/*、*/[ord*Ⅳ*]/*。*

### 8.16人名、地名、机构名中的数字，不单独标注int

/[P*佐腾一郎*]/

/[L*梅竹蹊六十七号茶花庄*]/ 

/[O*子弟一中*]/

/[O*三明市*]/

/*任*/*队长*/*的*/ [O*1205钻井队*]/

### 8.17外文字符串的标注

由于外文的词与词之间都有空格作为分隔符，因此无需再去切分，只在标点符号的前后加切分标记。遇到字母词、名称缩写等情况也不作切分，如：/COM/经济/（网络经济）、/E/产品/（电子产品）、/卡拉/OK/等。

/Good morning/  ,/everyone/./

/*最近*/*引进*/*一*/*台*/JT-ESWL-*Ⅲ*/*型*/*体*/*外*/*震波*/*粉碎*/*肾结石*/*机*/*，*/

"*/[L *ZHONG* HUA *REN* MIN *GONG* HE*GUO]/"/*，*/*这是*/[L*中华人民共和国*]/*的*/*汉语拼音*/*。*/*

"*/Brother/*，*/I *love* you *all* the *time/*，*/  Thank *you* very *much/*！*/"/ "/Happy *birthday* to*you/*！*/"

/Dip *one* end *of* a *straw* in *the* solution/./Blow *gently* through *the* straw/./ */A* soap *bubble* forms/./What *happens* when *you* keep *on* blowing/?/

/The *bubble* bursts *because* the *pressure* inside *the* bubble *is* more *than* the*pressure *outside* the*bubble/./

### 8.18数学公式和机型标号均作为一个整体来切分和标注

例如：

/*△*S/=/[len*12*（*S1*＋*S2*）*mm*]/

/*IEC298*．*265*．*129*．*694. *420*．*56.* 529*．*932*/

/*GB3804.* *3906*．*11022*/

/IEC60129A2/*（*/[dat*1996*]/）*UES*－*K3*／*2/ /UEMC40K8U*／*1*/ */1* V/FJ220001R2/

/*SFL12*／*17.5*/IVD *P575303RI/  /S*FL24A/IVDP5753/O2RI/

## 第九章 分词歧义消解细则

本章中的歧义切分实例是从微软亚洲研究院237万词训练语料、10万词测试语料和20万词散页语料中抽取出来的。这些歧义字段可粗分为交集型歧义(OAS)和组合型歧义(CAS)两大类。交集型歧义又包含用正反向最大匹配（ＭＭ）算法侦查不到的所谓隐藏的CAS。下面就分别介绍不同歧义字段的消解规则。

### 9.1交集型歧义字段（OAS）

#### 9.1.1交集型歧义字段示例

由于交集型歧义字段的例子太多，不便穷举，所以下面只列举少量实例供参考。

（１）/矛头/所/指/正是/以/包/代/管/、/负/盈/不/负/亏/、/

（２）/[L四川]/一/私营企业/家/向/下岗/女工/捐款/

（３）/柚木/购/进/后/市场价格/大/跌/，/

（４）/图/为/[O保险公司]/向/受灾/企业/赔/付/现场/

（５）/地方政府/亟需/在/加强/压/锭/监管/力度/方面/下功夫/，/

（６）/与/厂/内/存留/的/旧/纱/机/一并/销毁/。/

（７）经/请示/，/自行/将/本/厂/经/改造/的/应/压缩/设备

（８）擅自/新/增/棉纺/生产能力/，

（９）/有人/钻/政策/空子/、/骗/财政补贴/。

（１０）日益/猖獗/的/走私/犯罪/活动/，

（１１）/他/在/教务/活动/中/积极/研究/、

（１２）对/全/山/的/商业/网点/和/摊/区/重新/进行/了/规划/和/建设/。

（１３）/加强/各级/领导班子/建设/。

（１４）全体/员工/开展/了/"/人家/学/我们/，/我们/怎么办/"/的/大/讨论/，

（１５）/[O欧佩克]/提高/原油/配额/和/暖冬/等/因素/影响/，

（１６）/保护/国家/和/人民群众/的/生命/财产/安全/。

（１７）/以/维护/民族团结/为/己任/，

（１８）/各级/领导干部/要/站/在/党/和/国家/全局/的/高度/，

（１９）/只有/坚持/解放/思想/、/实事求是/的/思想/路线/，

（２０）/呈现/了/"/部队/添/战斗力/，/企业/增/生产力/，

（２１）/共建/双方/通过/自上而下/层/层/签约/，

（２２）/电力/部门/还/专门/建立/了/正规/的/转业军人/业务/培训/机制/，

（２３）/这/条/线/不/停电/，/官兵/跳伞/太/危险/了/。

（２４）如同/[L华中]/电网/强大/的/发电机/群/按照/同一/频率/转动/一样/，

（２５）通过/举办/一些/全/集团/参与/的/拥军/活动/，

（２６）/在/全国/工业/[ord５００]/强/中/名列前茅/的/大型/企业集团/。

（２７）/本/次/检测/中/性能/系数/最高/者/。

（２８）/不是/主张/所有/的/会议/都/开/成/电视电话/会议/。

#### 9.1.2隐藏的交集型歧义字段

隐藏的交集型歧义字段是指那些用正、反向最大匹配（ＭＭ）算法无法侦查到的交集型歧义字段。

注：以下例句中，每组第一行为错误的切分，第二行为改正后的切分。

（1）/[L新疆]/经济/社会/发展/一定/会展/现出/越来越/美好/的/前景/

/[L新疆]/经济/社会/发展/一定/会/展现/出/越来越/美好/的/前景/

（2）/成立/了/专/司空/中和/地面/服务/质量/监管/的/服务/质量/督察/办公室/，/

/成立/了/专/司/空中/和/地面/服务/质量/监管/的/服务/质量/督察/办公室/，/

（4）/其内/容或/规则/已/译/成/[int１５]/国/语言/，/

/其/内容/或/规则/已/译/成/[int１５]/国/语言/，/

（5）/这/一发/现有/可能/加速/艾滋病/新药/和/疫苗/的/研制/。/

这/一/发现/有/可能/加速/艾滋病/新药/和/疫苗/的/研制/。/

（6）/[L韩国]/对/日出/口中/，/

/[L韩国]/对/[L日]/出口/中/，/

（7）/恰/在/此时/，/奉/党委/派赴/[O共产国际]/工作/的/[P张太雷]/于/[dat８月]/回国/

/恰/在/此时/，/奉/党/委派/赴/[O共产国际]/工作/的/[P张太雷]/于/[dat８月]/回国

（8）/站/在建/设有/[L中国]/特色/社会主义/全局/

/站/在/建设/有/[L中国]/特色/社会主义/全局/

（9）/金融/危机/就/可能/会演/变为/经济危机/

金融/危机/就/可能/会/演变/为/经济危机/

（10）/如/少数/司机/在/东侧/门楼/外道/路上/违章/占/道/停车/，/

如/少数/司机/在/东侧/门楼/外/道路/上/违章/占/道/停车/，/

（11）/有关/部门/要/下决心/下力/气管/好/电子/游戏/室/。/

有关/部门/要/下决心/下/力气/管/好/电子/游戏/室/。/

（12）/相近/似的/设施/化/保护/菜地/面积/达/[are１３００万亩]/；/

/相/近似/的/设施/化/保护/菜地/面积/达/[are１３００万亩]/；/

（13）/表明/了/财政部/门对/落实/科教/兴/国/战略/采取/的/实际/行动/。/

/表明/了/财政/部门/对/落实/科教/兴/国/战略/采取/的/实际/行动/。/

（14）儿时/站/在家/门口/向/四面/望/，/

儿时/站/在/家门/口/向/四面/望/，/

（15）/就/不可能/正确/地理/解和/执行/党/的/路线/方针/政策/，/

/就/不可能/正确/地/理解/和/执行/党/的/路线/方针/政策/，/

（16）/使得/高校/中原/有的/个别/的/知识/物化/行为/迅速/扩展/为/一种/专门/职能/。/

/使得/高校/中/原有/的/个别/的/知识/物化/行为/迅速/扩展/为/一种/专门/职能/。/

（17）/这是/大都/市里/的/一个/皮货/修理/店/，/

/这是/大/都市/里/的/一个/皮货/修理/店/，/

（18）/需/招集/体制/女性/业务员/[int四名]/，/

/需/招/集体/制/女性/业务员/[int四名]/，/

（19）/连同/应/交费/用以/划/支票/寄/还/。/

/连同/应/交/费用/以/划/支票/寄/还/。/

（20）/我们/一口气/跑/到家/门口/的/一/棵/大树/前/，/

/我们/一口气/跑/到/家门/口/的/一/棵/大树/前/，/

（21）/"/唉/，/要是/好/好/复习/，/可不/会考/得/这样/糟/。/"/

"/唉/，/要是/好/好/复习/，/可/不会/考/得/这样/糟/。/"/

（22）/特制/定本/规定/。/

/特/制定/本/规定/。/

（23）/股东/会所/议事/项/作/成/会议/记录/，/

/股东/会/所/议/事项/作/成/会议/记录/，/

### 9.2组合型歧义字段(CAS)

组合型歧义字段在真实文本中大量出现，有的是比较常见的，有的是非常罕见的。尤其是有的CAS即使根据上下文也很难判断其正确切分，如正在、就是、还是、只有、只是、一道、一起等等。因此有必要针对那些高频的CAS逐条加以说明。

#### 9.2.1常见的组合型歧义字段

下面对一些常见的组合型歧义字段加以解释。

##### 9.2.1.1数词一和量词组成的CAS

词表词一个、一种、一类、一批、一次、一套、一阵等作为数量短语不予切分，也不标注int。其中有些量词重迭形式也是词表词，如一个个、一天天，应保持其整词形式，而其它非词表词的数量短语和量词重迭形式都是要切开的。（详见8.9）

/*一个*/*人*/

/*一个个*/*观众*/

/*一天天*/*暖和*/*起来*/

/*一套*/*特种*/*邮票*/

/*一次*/*讨论*/

/*一*/*匹*/*黄骠马*/

/*一*/*栋*/*栋*/*楼房*/

词表词一起、一道、一样、一手、一面、一口、一头、一气等既可以用作连词、副词、名词或形容词等，又可以切分开来成为数量短语。但像一套这样的词表词，除了数量短语的用法以外，不再有其它用法，因此不存在切分问题。词表词有一套是有本事的意思时，也不切分。这类词的切分问题只能逐个加以描述。

##### 9.2.1.2动量词次与频率fre的标注

动量词中只有*次*被标注为频率*fre*，如[fre*再次*]*、*[fre*数次*]*、*[fre*一次次*]*、*[fre*无数次*]*、*[fre*好几次*]*，而*遍、回、趟*不标注为频率，一*/*遍、一*/*回、一*/*趟、一次（词表词不切分）、一*/*遍*/*又*/*一*/*遍、一*/*回*/*又*/*一*/*回、一*/*趟*/*又*/*一*/*趟，一次*/*又*/*一次*也不标注为*fre*。这条规则的理由如下：

(1)遍表达的是动作从开始到结束的全过程；次、回描写动作的重复；趟只用于表示行走意义的动词。*去一趟*可以说成*去一次*、*去一回*，但*做一次*、*做一回*不能说成*做一趟*。

*遍、次、回*有时可通用，如*你再唱一遍*，可以说成*你再唱一次*或*你再唱一回*而意思不变。但单纯表示动作数量时，只用*次*，不用*遍*，如*他表示了多次*、*敌人的三次进攻都被击退了*。

*次*与*回*区别在于，*次*既用于书面语又用于口语；*回*只用于口语。如*多次、数次*等带文言色彩的短语，就不能说成*多回、数回*。

(2)*这本书我看了一遍*，是指从书的开头到末尾的全过程。*这本书我看了一次*，着重指看的次数，不指看的全过程。

##### 9.2.1.3一(1)＋物理单位元量词构成度量表达式

当一(1)后面是长度、重量等物理单位元时应分别按度量表达式标注为*len*，*wei*，如[len*一米*]、[wei*1公斤*]（见8.10）。

#### 9.2.2CAS示例

下面是一些常见CAS的切分规则和示例。

（1）人为作形容词时不切分。

（1a）而是/深究/灾难/中/的/人为/因素/。（1b）以/人/为/本/

（2）为人：

（2a）也/包括/最/基本/的/为人/处事/的/行为/准则/

（2b）/始终/主宰/着/他/的/为人/之/道/和/为/艺/之/方/。（2c）/我/把/不大/为/人/所/知/的/一些/往事/写/下来/，

（3）一起：作名词和副词使用时不切分，作为"数+量词"时切分。（3a）/和/市民/一起/聊天/，/听取/群众/反映/。

（3b）/[dat４月１７日]/发生/在/[L北京]/[L海淀区]/[L阜石路]/的/一/起/车祸/，/

（4）一点:形容词，意思是少许，不切分。但作为数量短语时要切开。（4a）/文/中/还有/一点/小/差误/，/也/顺便/提/提/。/

（4b）/都/清楚/地/意识到/了/这/一/点/，/

（5）一道：作副词使用时不切分；作为"数+量词"时切分。

（5a）/而且/要求/未来/的/丈夫/同/她/一道/挑起/照顾/[P穆]/大爷/的/担子/。

（5b）/已/成为/百里/油田/的/一/道/风景/线/。

（5c）/在/我/的/前额/刻下/了/一/道/道/弯曲/的/青春/印记/。

（6）一面：作名词和副词使用时不切分，作为"数+量词"时切分。

（6a）/虽然/在/现代汉语/里/含有/贬义/，/但/其/积极/的/一面/应该/肯定/。

（6b）/一面/学习/，/一面/实践/，/贯彻/到/筹组/[L澳门特别行政区]/的/工作/中/去/。（6c）/爱/是/一/面/辽阔/光滑/的/回音壁/，/微小/的/爱/意/反复/回响/着/，

（7）一口：作形容词和副词使用时不切分，作为"数+量词"时切分。（7a）/[P崔]/又/一口/回绝/并/与/其/发生/争吵/。

（7b）要不/则/是/一/脸/匪/相/或者/一口/痞/气/，

（7c）不由得/倒/吸/了/一/口/冷气/打/了/一个/寒战/，

（7d）一/口/大/锅/解决/了/[int两家]/的/再就业/难题/。

（8）一手既有名词和副词的用法，又有"数+量词"的用法，但在文本中一律不予区分。

（8a）所有/的/为/官/为/政/者/都/能/写/一手/好/文章/，（8b）/整个/事件/是/他/一手/策划/的/，

（8c）/他/一手/划水/，/一手/搂/着/女/青年/游/向/岸边/。

（9）一头：作名词和副词使用时不切分，作为"数+量词"时切分。（9a）/二来/街道/一头/联/着/片/内/的/企业单位/，/一头/联/着/居民/，

（9b）/此后/，/他/一头/钻进/常年/云雾/缭绕/的/云雾山/，/拜访/民间/郎中/，（9c）/一/头/经过/救助/已/恢复/健康/的/灰/鲸/『/[P杰杰]/』/

（10）一路：作名词和副词使用时不切分；作为"数+量词"时切分。

（10a）有时/公共汽车/挤/不/上/，/干脆/快步/当/车/一路/小跑/。

（10b）我们/一路/攀登/来到/[P王永祥]/简陋/的/护林/小屋/。

（10c）另/一/路/是/探索/[L火星]/、/[L木星]/等/星球/。

（11）一下：用作副词和数量词使用时不切分；当一作副词下作动词时要切分。

（11a）/只要/通融/一下/，/既/能/得到/一/笔/大钱/，/又/能/保持/友情/。

（11b）/相互/拍打/一下/：/"/你/猜/[rat几比几]/？/"

（11c）/书包/斜/背/在/肩/上/，/带子/太/长/，/随着/步子/一/上/一/下/跳跃/着/拍打/在/屁股/上/。

（12）一片：作形容词使用时不切分；作数量短语时切分。

（12a）/台上/台/下/那/一片/亲切/和谐/的/气氛/，

（12b）/融入/一片/[dat夏日]/的/浓绿/之中

（12c）/地板/上/看/不/到/一/片/碎/纸屑/。

（12d）/宽宽大大/的/粽/叶/，/她/总/要/一/片/片/洗/净/。

（13）一则：作副词使用时不切分；作数量短语时切分。

（13a）/一则/表达/对/同乡/画/马/大师/[P徐悲鸿]/的/敬仰/，/二/则/愿/家乡/建设/如/骏马/奔腾/一日千里/。

（13b）/[L法国]/报纸/刊/出/一/则/特写/，

（14）不见：是动词见的否定形式不切分。当它同前面的动词形成V/*得*/*见*、*V/*不*/*见*的*

可能式动补结构时，要切分。类似的可能式动补结构还有*V/*得*/*下去*/*、*V/*不*/*下去*/*，

*V/*得*/*来*/*、*V/*不*/*来*，* *V/*得*/*起、*V/*不*/*起*，** V/*得*/*了*/*、*V/*不*/*了*/*，*V/*得*/*成*/*、*V/*不*/*成*/*，*长*/*得*/*大*/*、长*/*不*/*大*/*等*。

（14a）/全/都是/"/不见/兔子/不/撒/鹰/"/。

（14b）/人武部/就/看/不/见/一/盏/长明灯/，

（15）不对：作形容词表示不正确时不切分；如果对作为介词，就要切开。

（15a）父母/这么/想/当然/不对/，/可/也/不能/全/怪/他们/的/愚钝/和/落后/。

（15b）/中国/主张/和平/的/外交/政策/，/中国/不/对/任何/国家/构成/威胁/。

（16）不等：作形容词表示不相同时不切分；如果等作为动词，就要切开。

（16a）/按照/用户/要求/生产/大小/不等/的/编织/塑料/袋/，

（16b）不/等/妻子/说/什么/，/他/自己/悄悄/地/找/开/了/出路/。

（16c）/时间/不/等/人/

（17）不下：表示不少于时不切；作为动词下的否定式和可能式动补结构（见14），就要切开。

（17a）/每天/她/经手/的/业务/不下/[int百笔]/，

（17b）/架子/还/放/不/下/，/面子/还/丢/不/开/，

（17c）/刑/不/上/大夫/，/礼/不/下/庶人/

（17d）/[L俄罗斯]/整个/国家/开支/居/高/不/下/，

（18）不成：作动词、形容词和助词使用时不切分；当它作为可能式动补结构（见14）时，一律切开。

（18a）/难道/自己/这/一辈子/就/这么/过/不成/？

（18b）/毛虾/已/不成/汛/，

（18c）攀/"/亲/"/不成/反/折本/，

（18d）/往往/是/有/点/而/形/不/成/网/，

（19）上下：用作动词时一律切开，如"上/下/火车"；用作名词（包括并列意义）时则不切，如"上下/两册"。

（19a）/经过/上下/的/共同/努力/，

（19b）/上/下/车/、/船/，/须/待/车/、/船/停/稳/后/先/下/客/后/上/客/，

（20）从前：作时间名词时不切分；如果从作介词前作方位词，就要切开。

（20a）/有的/是/从前/在/队/中/当/板凳/球员/，

（20b）/导致/美元/对/马克/的/汇价/从/前/一/交易/日/的/[rat１比１·７７６６]/降/至/[rat１比１·７７６２]/。

（20c）/从/前/不久/[L深圳]/一家/公司/大规模/地/恶意/抢/注/商标/案/，

（21）以为：作动词时不切分。

（21a）/以为/强大/的/[P卡斯珀罗夫]/恢复/了/他/的/本来面目/。

（21b）有/一些/干部/想/不/通/，/以为/是/搞/形式/，/出风头/。

（21c）/我们/引/以/为/自豪/的/风格/多少/应/有些/改变/了/。

（21d）/代表/们/以/为/人民/高度/负责/的/精神/，/提出/批评/和/意见/。

（22）正当：作形容词时不切分。

（22a）/我们/是否/能/以/某/种/不/正当/的/方式/反对/，

（22b）/正/当/禾苗/生长/关键/时期/，/

（23）正在：

（23a）/对/各地/已/建成/尚未/售出/和/正在/建设/的/住房/，

（23b）/记者/正/在/回/[L巴黎]/的/高速/列车/上/。

（23c）/[O世界卫生组织]/正/在/[L科特迪瓦]/召开/国际/会议/，

（24）会上：

（24a）/[L苏州]/等/省市/及/有关单位/在/会上/介绍/了/经验/，

（24b）/在/[O国际泳联]/[dat二十四日]/举行/的/听证/会/上/，

（25）台上：

（25a）/表演/完/节目/后/竟/在/台上/掩/面/而/泣/。/

（25b）/预赛/是/在/有/围/绳/的/拳击/台/上/

（26）走向：用作名词时不切分；用作动词+介词时，一律切分。

（26a）/[L北京]/输/气/管道/工程/线路/走向/示意图/（/示意图/：/[P孙伟]/绘/）/

（26b）/迈出/了/我国/航天/事业/走/向/世界/的/[ord第一步]/。

（27）才能：用作名词时不切分；用作副词+能愿动词时，必须切开。

（27a）/但/如果/施展/才能/的/空间/很/大/，/而且/能/充分/发挥/所/学/专长/，/不妨/一/试/。

（27b）/勤奋/才/能/有/真知灼见/；

（28）人才：用作名词时不切分。

（28a）要/想/成为/[dat跨世纪]/人才/，/光/有/专业知识/不够/，

（28b）/这/恐怕/只有/浪漫/的/[L法国]/人/才/想/得/出来/。

（29）上来：作趋向动词和动词时不切分；当上作方位词时，就要切开。

（29a）/一/届/新/班子/上来/以后/，/

（29b）/把/工作/重点/转移/到/社会主义/现代化/建设/上/来/，/

(30)上去：作趋向动词和动词时不切分；当上作方位词时，就要切开。

（30a）/显然/是/上/了/学/的/[L瑶族]/娃子/写/上去/的/。/

（30b）/把/科研/技术/成果/转移/到/社会/应用/上/去/。/

（31）上前：

（31a）/他/的/四个/弟兄/挨次/伸出/手/来/上前/祝贺/

（31b）/当即/冲/上/前/去/，/扭/住/一/名/歹徒/不/放/，/

（32）上路：作动词时不切分；上作动词时要切分。

（32a）/我/背/起/你/的/薄被/送/你/上路/，

（32b）/过去/村里/也是/上/路/打场/，/

（33）得了：取助词用法时不切分；但作为动词+助词（了）和可能式动补结构（见14）时要切分。

（33a）/没/叫/到/你/的/时候/，/安心/等/着/就/得了/。

（33b）/经/医生/诊断/他/得/了/胃癌/。

（33c）/书记/何以/承受/得/了/，

（34）得出：作动词时不切；作可能式动补结构（见14）时要切分。

（34a）[L天津]/近几年/的/实践/得出/了/肯定/的/答案/。

（34b）为了/让/[dat今年]/蒜农/的/产品/卖/得/出/、/卖/出/好/价钱/，/

（35）人称：作名词时不切分；当称作动词时要切分。

（35a）作者/用/第一/人称/的/叙述/手法/，

（35b）据/用/过/的/人/称/，/打/国际/长途/如/从/[L北京]/到/[L美国]/，/每/分钟/只需/传统/电话/费用/的/[fra１／４]/。

（36）同行：用作名词时不切分，读作tonghang；用作动词时读作tongxing，一律切分。

（36a）/这时/恰/有/同行/到来/，/只好/借/[mon一元钱]/给/他/。

（36b）/笔者/与/她/骑车/同/行/。

（37）从小：

（37a）/图文并茂/、/声/形/兼备/的/写作/能力/将要/从小/培养/，

（37b）/企业/从无到有/，/从/小/到/大/，

（38）中学：

（38a）/在/长期/的/中学/教学/实践/中/我/体会/到/，

（38b）/引导/他们/在/实践/中/学/会/正确/行使/民主/权利/。

（39）上门：

（39a）/营业员/们/便/主动/上门/收款/。

（39b）/打/出/了/名气/，/找/上/门/来/的/工程/一个/接/一个/。

（39c）我/冲/出/门/去/，/随手/拉/上/门/。/

（40）声响：作名词使用时不切分。

（40a）/而/轻轻/地/挪动/椅子/走开/，/无/一点/声响/。/

（40b）/"/哗哗/"/的/潮水/声/响/成/一片/，

（41）就此：作副词使用时不切分。

（41a）/国际/足球/界/一些/有识之士/就此/产生/一种/忧虑/，

（41b）我们/也/欢迎/科技界/人士/就/此/问题/发表/意见/和/建议/。

（42）高层次作形容词时不切分；当该词前有副词修饰时需切分。

（42a）/着眼点/放/在/培养/造就/大批/高层次/科技/人才/上/。

（42b）/实现/更/大/规模/、/更/高/层次/的/扩张/和/发展/。/

（43）有的：

（43a）/有的/用/汉语/，/有的/用/俄语/，

（43b）/是/[L北大荒]/独/有/的/风味/。

（44）的话：作助词使用时不切。

（44a）/如果/要/使/谈判/取得/迅速/进展/的话/，

（44b）/[P卡比拉]/先生/对/我/的/话/是/持/认真/态度/的/。

（45）话说：整体作动词使用时不切。

（45a）/话说/当年/，/他/言语/铿锵/：/"/在/当时/，/一切/都/得/打破常规/。

（45b）用/他/自己/的/话/说/，

（46）标本：意思为生物/标本时不切；表示"直接和根本"并列的意思时要切开。

（46a）/他们/还/结合/挂图/、/标本/进行/讲解/。

（46b）/反/腐败/要/坚持/标/本/兼/治/，

（47）上将：作为军衔使用时不切分。

（47a）/[O中央军委]/副/主席/、/国务委员/兼/[O国防部]/长/[P迟浩田]/上将/

（47b）/[L中国]/在/人口/问题/上/将/面临/新/的/挑战/。

（48）将军：作为军衔使用时不切分。

（48a）/党/和/国家/领导人/、/解放军/元帅/、/将军/、/政府/省/部级/干部

（48b）/将/军：将/军/体/与/群体/紧密/结合/，/开办/体育/知识/讲座

（49）之一：

（49a）/企业/领导班子/不/适应/社会主义/市场经济/的/要求/是/主要/原因/之一/。

（49b）/游人/视线/随/之/一/收/，/"/[L太和宫]/"/[int三个]/大字/豁然/在/目/。/

（49c）/我/不禁/为/之/一/震/。

（50）到家：作为形容词不切分。

（50a）/现在/不行/，/你/技术/不/过关/，/说明/练/得/还/不/到家/，/

（50b）/果不其然/，/此/儿/到/家/就/猝不及防/地/给/了/他/妈/一/刀/。

（50c）/[P赵匡胤]/终于/将/义/妹/[P京娘]/送/到/家/。

（51）在家：

（51a）/一直/在家/等待/厂子/通知/上班/的/她/再/也/沉/不住/气/了/

（51b）/实现/访客/在/家/门口/与/住户/可/视/

（51c）/把/她/一/人/放/在/家/中/[P孙威锋]/放心不下/，

（52）人均：

（52a）/学生/拥有/计算机/的/人均/占有率/最高/

（52b）/[int两]/人/均/未/达到/[fra２／３]/的/当选/票/数/，

（53）中用：

（53a）/"/察/古/知/今/"/基本上/不/中用/了/，

（53b）/天文学/上/把/[L宇宙]/中/用/光学/方法/看/不/到/的/物质/称/做/暗/物质/，

（53c）/西/体/[L中]/用/我/也/反对/，

（54）前去：

（54a）/让/[P胡洁青]/前去/扶持/、/帮忙/。

（54b）一/名/应邀/到会/的/[L北京]/小学生/激动/地/跑/上/前/去/请/他/签名/。

（55）词表词"受过"只有"代人受过"的意思。当动词受和助词过构成"动+助"结构时，一律切开。

（55a）/它们/代/四/奸/受过/，

（55b）/[P鲁迅]/虽然/在/[dat二十年代中期]/受/过/[P托洛茨基]/的/一定/影响/，

（56）结果：有名词和动词两种用法，都不切分，动词结果的意思是杀死，而不是结出果实的意思。作为后一个意思，名词果是动词结的宾语，所以需切分。

（56a）/矫枉过正/的/结果/，/是/大家/几乎/忘/了/怎么/吃/，/

（56b）/种/果树/一般/要/三年/才/能/结/果/，

#### 9.2.4就是、只有、只是、还是的切分规则

##### 9.2.4.1就是

就是作副词、连词、助词使用时不切分。但作动词时，就是副词，是是动词，一律切分。

(A)作副词时，共有6个义项：

（i）单用，表示同意，对；

(ii)表示坚决，不可更改；

(iii)强调肯定某种性质和状态，含有反驳意味；(iv)强调迅速果断；

(v)确定范围，排除其它；(vi)表示没有别的情况。

（1）我/一定/办到/，您/放心/就是/。/

（2）/反正/姥爷/就是/看/我/不/顺心/，/一点/也/不/喜欢/我/。/

（3）/望/着/车/来/车/往/的/马路/，/一/站/就是/[int几个小时]/。/

（4）/就是/节目/诉/求/为/非常/鲜明/的/单一/主题/，

(B)作连词有2个义项：

(i)表示假设的让步；即使(后面常用也作呼应)；

(ii)表示一种极端情况；纵然。如：

(5)不是/播种/，/就是/锄地/；/不是/下/田/挖/野菜/，/就是/上山/打柴/。

(6)这个/建议/好/倒/是/好/，/就是/远水/不解/近/渴/。/

(C)就是作动词时，一律切分，就是副词，是是动词。如：

（7）/[O光华国中]/职员/[P杨一中]/就/是/买/菜/变成/[O慈德]/会员/的/一/例/。

（8）/多元化/的/意思/就/是/有/了/更/多/的/选择/，

（9）/最/特别/的/就/是/黄金/压制/的/邮票/。

（10）/最/关键/的/原则/就/是/「/避/凶/趋/吉/」/，

（11）这/就/是/[L海尔-波普彗星]/。/

##### 9.2.4.2只有

只有作为一个词表词有副词和连词两个义项。当它用作动词时，一律切开。（A）只有做副词相当于只好，表示唯一的选择。如：

（1）/家属/最后/只有/寄/望/对岸/[O海协会]/能/请/[L大陆]/渔船/协/寻/。

（2）/[L中国]/的/体育/长期/是/国家/一/家/办/，/发达国家/是/国家/不/办/，/只有/社会/办/，/现在/国际/体育/的/潮流/是/国家/与/社会/共同/兴办/。/

（3）无/雪/的/[dat冬天]/是/难挨/的/，/我/只有/在/心中/落/着/一/场/场/大雪/。/

（4）协办员/和/见习员/在/通过/[int三道]/关/后/，/还要/经由/主办员/挑选/，/没有/主办员/

挑选/的/也/只有/待岗/。/

（5）如果/"/邪恶/的/敌人/对/[L伊]/发动/侵略/，/[L伊拉克]/将/别无选择/，/只有/用/其/全部/的/潜力/、/经验/和/信仰/进行/自卫/"/。/

(B)"只有"作连词用表示必要条件，下文常用副词才、方呼应。如：

（6）只有/掌握/了/最/先进/的/科学/，/我们/才/能/有/巩固/的/国防/。/

（7）高尚/的/世界/只/对/高尚/的/人们/存在/，/高尚/的/精神/境界/只有/高尚/的/人们/才/有/

资格/领略/。/

（C）"只有"用作动词时一律切开。这时"只"做副词、"有"作动词。如：

（8）/完成/管理/的/比率/只/有/[per百分之八十九]/，

（9）/车行/时速/只/有/[len卅到五十公里]/左右/；

（10）/目前/[O基隆邮局]/只/有/一个/集邮/柜台/，

（11）/因/[O中嵙国小]/整个/学区/只/有/一个/[L中嵙里]/，

##### 9.2.4.3还是

还是有三种用法：连词、副词和动词。作动词时一律切分。

（A）作连词用时表示选择，通常跟无论、不管等连用。带连词还是的句子，除疑问句外，还是都可以换成或者，意思不变。例如：

（1）无论是/说/新/话/，/提/新/观点/，/还是/放弃/前人/和/本本/上/的/过时/的/观点/、/错误/的/结论/，/都/需要/勇气/。/

（2）农民/[P张戎梅]/说/：/"/我们/村/不论/是/养猪/还是/种菜/的/，/现在/都/把/眼睛/盯/在/了/铁路/两头/。/"/

（3）不管/是/开工/还是/竣工/，/既/有/庆典/，/又/有/报导/，/或/称/世纪/工程/，

（4）他们/不但/是/我们/公司/发展/的/"/动力/之/源/"/，/还是/我们/学习/的/好榜样/！/

（B）还是作副词用时有三个义项：

（i）表示行为、动作或状态继续保持不变，相当于"仍然"、"依然"。

（ii）表示经过比较后做出的选择。

（iii）加强语气，相当于"到底"、"究竟"、"毕竟"。

还是/d用在动词、形容词前，可以省作还，而用在主语前不能省作还。

（5）/但/现场/交通/还是/十分/杂乱/。

（6）/该/基金/还是/可以/支应/灾民/最高/[mon一百万元]/的/贷款/额/，

（7）/很多/居民/还是/使用/地下水/，

（8）/[P陈]/还是/不/改/顽皮/个性/，

（9）/多数人/还是/喜欢/为/宝宝/选/个/金/饰/，

（10）/[P陈小弟]/的/父/母亲/还是/勇敢/地/生/下/他/，

(C)还是用作动词时一律都要切开，即还作副词使用，是作动词使用。句型"是/v……的"可以帮助我们判断还是在句中是不是一种动词的用法。

（11）/关键/还/是/在/府/会/双方/态度/，

（12）/初/到/部队/，/[age十五六岁]/，/还/是/个/没/见/过/世面/的/毛孩子/。/

（13）/她/不/相信/歌剧/这/门/综合/艺术/会/落入/低谷/，/认为/关键/还/是/提高/歌剧/自身/的

/品质/。/

（14）但/在/日常/工作/中/，/我/深感/除了/忙/还/是/忙/，/搞/得/焦头烂额/，/一天到晚/自己/不/属于/自己/。/

##### 9.2.4.4只是

只是有三种用法：副词、连词和动词。作动词时统统切分。

（A）只是作副词使用时有两个义项：

(i)表示限定某种情况或范围，相当于仅仅是。句末用而已或罢了等配合，

表示语气更为缓和。

(ii)强调在任何条件下情况都不变，有总是的意思。

（1）/只是/作为/预定/分娩/日/的/参考/。

（2）/只是/没有/焢/窑/经验/的/[P张]/课/长/，

（3）/他/虽/表示/民意调查/结果/数据/只是/具有/参考/价值/，

（4）/施工/初期/只是/修剪/树枝/，

（B）只是作连词用，用在后一分句，表示轻微的转折，补充修正上文的意思，与不过的用法相近。

（5）/记者/在/重灾区/[L大河乡]/注意/到/，/群众/有/饭/吃/，/有/衣/穿/，/有/伤病/能/医治/，/只是/搭建/的/小/窝棚/难以/抵御/坝上/呼啸/的/寒风/。/

（6）[dat唐朝]/著名/诗人/[P李商隐]/『/夕阳/无限/好/，/只是/近/黄昏/』/的/诗句/是/对/黄昏/的/叹息/和/无奈/，/

(C)只是用作动词时一律要切开，即只作副词，是作动词。如：

（7）/他/只/是/[dur一个月]/领/[mon二万多元]/的/工人/

（8）/事实上/[L盐埔乡]/公所/的/薪水/无/着落/只/是/冰山/一/角/，

（9）/这些/需求/不只/是/钱/或/资源/，/


================================================
FILE: docs/api/common/configurable.rst
================================================
.. _api/configurable:

configurable
====================


.. autoclass:: hanlp_common.configurable.Configurable
	:members:

.. autoclass:: hanlp_common.configurable.AutoConfigurable
	:members:


================================================
FILE: docs/api/common/conll.rst
================================================
.. _api/conll:

conll
====================


.. autoclass:: hanlp_common.conll.CoNLLWord
	:members:

.. autoclass:: hanlp_common.conll.CoNLLUWord
	:members:

.. autoclass:: hanlp_common.conll.CoNLLSentence
	:members:

================================================
FILE: docs/api/common/constant.rst
================================================
constant
====================


.. automodule:: hanlp_common.constant
	:members:


================================================
FILE: docs/api/common/document.rst
================================================
.. _api/document:

document
====================

.. currentmodule:: hanlp_common

.. autoclass:: hanlp_common.document.Document
	:members:


================================================
FILE: docs/api/common/index.md
================================================
# hanlp_common

Common APIs shared between `hanlp` and `restful`.

```{toctree}
document
conll
configurable
constant
```


================================================
FILE: docs/api/hanlp/common/component.rst
================================================
component
=================

.. currentmodule:: hanlp.common

.. autoclass:: hanlp.common.component.Component
	:members:


================================================
FILE: docs/api/hanlp/common/dataset.md
================================================
# dataset

This module provides base definitions for datasets, dataloaders and samplers.

## datasets

```{eval-rst}
.. currentmodule:: hanlp.common

.. autoclass:: hanlp.common.dataset.Transformable
	:members:

.. autoclass:: hanlp.common.dataset.TransformableDataset
	:members:
	:special-members:
	:exclude-members: __init__, __repr__
```

## dataloaders

```{eval-rst}
.. currentmodule:: hanlp.common

.. autoclass:: hanlp.common.dataset.PadSequenceDataLoader
	:members:
	:special-members:
	:exclude-members: __init__, __repr__

.. autoclass:: hanlp.common.dataset.PrefetchDataLoader
	:members:
	:special-members:
	:exclude-members: __init__, __repr__
```

## samplers

```{eval-rst}
.. currentmodule:: hanlp.common

.. autoclass:: hanlp.common.dataset.BucketSampler
	:members:

.. autoclass:: hanlp.common.dataset.KMeansSampler
	:members:

.. autoclass:: hanlp.common.dataset.SortingSampler
	:members:
```

## sampler builders

```{eval-rst}
.. currentmodule:: hanlp.common

.. autoclass:: hanlp.common.dataset.SamplerBuilder
	:members:

.. autoclass:: hanlp.common.dataset.SortingSamplerBuilder
	:members:

.. autoclass:: hanlp.common.dataset.KMeansSamplerBuilder
	:members:

```

================================================
FILE: docs/api/hanlp/common/index.md
================================================
# common

Common base classes.

```{toctree}
structure
vocab
transform
dataset
component
torch_component
```


================================================
FILE: docs/api/hanlp/common/structure.md
================================================
# structure

```{eval-rst}
.. currentmodule:: hanlp.common

.. autoclass:: hanlp.common.structure.ConfigTracker
	:members:

.. autoclass:: hanlp.common.structure.History
	:members:

```


================================================
FILE: docs/api/hanlp/common/torch_component.md
================================================
# torch_component

```{eval-rst}
.. currentmodule:: hanlp.common.torch_component

.. autoclass:: hanlp.common.torch_component.TorchComponent
	:members:

```


================================================
FILE: docs/api/hanlp/common/transform.md
================================================
# transform

```{eval-rst}
.. currentmodule:: hanlp.common

.. autoclass:: hanlp.common.transform.VocabDict
	:members:

```


================================================
FILE: docs/api/hanlp/common/vocab.md
================================================
# vocab

```{eval-rst}
.. currentmodule:: hanlp.common

.. autoclass:: hanlp.common.transform.Vocab
	:members:
	:special-members:
	:exclude-members: __init__, __repr__, __call__, __str__

```


================================================
FILE: docs/api/hanlp/components/classifiers.md
================================================
# classifiers

```{eval-rst}
.. currentmodule:: hanlp.components.classifiers

.. autoclass:: hanlp.components.classifiers.transformer_classifier.TransformerClassifier
	:members:

```


================================================
FILE: docs/api/hanlp/components/eos.md
================================================
# eos

```{eval-rst}
.. currentmodule:: hanlp.components.eos

.. autoclass:: hanlp.components.eos.ngram.NgramSentenceBoundaryDetector
	:members:

```


================================================
FILE: docs/api/hanlp/components/index.md
================================================
# components

NLP components.

```{toctree}
mtl/index
classifiers
eos
tokenizers/index
lemmatizer
taggers/index
ner/index
parsers/index
srl/index
pipeline
sts
```


================================================
FILE: docs/api/hanlp/components/lemmatizer.md
================================================
# lemmatizer

```{eval-rst}
.. currentmodule:: hanlp.components.lemmatizer

.. autoclass:: TransformerLemmatizer
	:members:

```


================================================
FILE: docs/api/hanlp/components/mtl/index.md
================================================
# mtl

Multi-Task Learning (MTL) framework.

```{toctree}
mtl
tasks/index
```


================================================
FILE: docs/api/hanlp/components/mtl/mtl.md
================================================
# MultiTaskLearning

```{eval-rst}
.. currentmodule:: hanlp.components.mtl

.. autoclass:: hanlp.components.mtl.multi_task_learning.MultiTaskLearning
	:members:
	:special-members:
	:exclude-members: __init__, __repr__

```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/constituency.md
================================================
# con

Constituency parsing.

```{eval-rst}
.. currentmodule:: hanlp.components.mtl

.. autoclass:: hanlp.components.mtl.tasks.constituency.CRFConstituencyParsing
	:members:
	:exclude-members: execute_training_loop, fit_dataloader

```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/dep.md
================================================
# dep

Dependency parsing.

```{eval-rst}
.. currentmodule:: hanlp.components.mtl

.. autoclass:: hanlp.components.mtl.tasks.dep.BiaffineDependencyParsing
	:members:
	:exclude-members: execute_training_loop, fit_dataloader

```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/index.md
================================================
# tasks

Multi-Task Learning (MTL) tasks.

```{toctree}
task
constituency
dep
sdp
ud
lem
pos
tok
ner/index
srl/index
```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/lem.md
================================================
# lem

Lemmatization.

```{eval-rst}
.. currentmodule:: hanlp.components.mtl

.. autoclass:: hanlp.components.mtl.tasks.lem.TransformerLemmatization
	:members:
	:exclude-members: execute_training_loop, fit_dataloader

```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/ner/biaffine_ner.md
================================================
# biaffine_ner

Biaffine Named Entity Recognition.

```{eval-rst}
.. currentmodule:: hanlp.components.mtl

.. autoclass:: hanlp.components.mtl.tasks.ner.biaffine_ner.BiaffineNamedEntityRecognition
	:members:
	:exclude-members: execute_training_loop, fit_dataloader

```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/ner/index.md
================================================
# ner

Named Entity Recognition.

```{toctree}
tag_ner
biaffine_ner
```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/ner/tag_ner.md
================================================
# tag_ner

Tagging based Named Entity Recognition.

```{eval-rst}
.. currentmodule:: hanlp.components.mtl

.. autoclass:: hanlp.components.mtl.tasks.ner.tag_ner.TaggingNamedEntityRecognition
	:members:
	:exclude-members: execute_training_loop, fit_dataloader

```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/pos.md
================================================
# pos

Part-of-speech tagging.

```{eval-rst}
.. currentmodule:: hanlp.components.mtl

.. autoclass:: hanlp.components.mtl.tasks.pos.TransformerTagging
	:members:
	:exclude-members: execute_training_loop, fit_dataloader

```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/sdp.md
================================================
# sdp

Semantic Dependency Parsing.

```{eval-rst}
.. currentmodule:: hanlp.components.mtl

.. autoclass:: hanlp.components.mtl.tasks.sdp.BiaffineSemanticDependencyParsing
	:members:
	:exclude-members: execute_training_loop, fit_dataloader

```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/srl/bio_srl.md
================================================
# bio_srl

BIO Tagging based Semantic Role Labeling.

```{eval-rst}
.. currentmodule:: hanlp.components.mtl

.. autoclass:: hanlp.components.mtl.tasks.srl.bio_srl.SpanBIOSemanticRoleLabeling
	:members:
	:exclude-members: execute_training_loop, fit_dataloader

```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/srl/index.md
================================================
# srl

Semantic Role Labeling.

```{toctree}
bio_srl
rank_srl
```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/srl/rank_srl.md
================================================
# rank_srl

Span Ranking Semantic Role Labeling.

```{eval-rst}
.. currentmodule:: hanlp.components.mtl

.. autoclass:: hanlp.components.mtl.tasks.srl.rank_srl.SpanRankingSemanticRoleLabeling
	:members:
	:exclude-members: execute_training_loop, fit_dataloader

```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/task.md
================================================
# Task

```{eval-rst}
.. currentmodule:: hanlp.components.mtl

.. autoclass:: hanlp.components.mtl.tasks.Task
	:members:
	:exclude-members: execute_training_loop, fit_dataloader

```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/tok.md
================================================
# tok

Tokenization.

```{eval-rst}
.. currentmodule:: hanlp.components.mtl

.. autoclass:: hanlp.components.mtl.tasks.tok.tag_tok.TaggingTokenization
	:members:
	:exclude-members: execute_training_loop, fit_dataloader

```


================================================
FILE: docs/api/hanlp/components/mtl/tasks/ud.md
================================================
# ud

Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing).

```{eval-rst}
.. currentmodule:: hanlp.components.mtl

.. autoclass:: hanlp.components.mtl.tasks.ud.UniversalDependenciesParsing
	:members:
	:exclude-members: execute_training_loop, fit_dataloader

```


================================================
FILE: docs/api/hanlp/components/ner/biaffine_ner.md
================================================
# biaffine_ner

Biaffine Named Entity Recognition.

```{eval-rst}
.. currentmodule:: hanlp.components.ner.transformer_ner

.. autoclass:: hanlp.components.ner.biaffine_ner.biaffine_ner.BiaffineNamedEntityRecognizer
	:members:

```


================================================
FILE: docs/api/hanlp/components/ner/index.md
================================================
# ner

Named Entity Recognition.

```{toctree}
transformer_ner
rnn_ner
biaffine_ner
```


================================================
FILE: docs/api/hanlp/components/ner/rnn_ner.md
================================================
# rnn_ner

Tagging based Named Entity Recognition.

```{eval-rst}
.. currentmodule:: hanlp.components.ner.rnn_ner

.. autoclass:: hanlp.components.ner.rnn_ner.RNNNamedEntityRecognizer
	:members:

```


================================================
FILE: docs/api/hanlp/components/ner/transformer_ner.md
================================================
# transformer_ner

Tagging based Named Entity Recognition.

```{eval-rst}
.. currentmodule:: hanlp.components.ner.transformer_ner

.. autoclass:: hanlp.components.ner.transformer_ner.TransformerNamedEntityRecognizer
	:members:

```


================================================
FILE: docs/api/hanlp/components/parsers/biaffine_dep.md
================================================
# biaffine_dep

Biaffine dependency parser.

```{eval-rst}
.. currentmodule:: hanlp.components

.. autoclass:: hanlp.components.parsers.biaffine.biaffine_dep.BiaffineDependencyParser
	:members:

```


================================================
FILE: docs/api/hanlp/components/parsers/biaffine_sdp.md
================================================
# biaffine_sdp

Biaffine semantic dependency parser.

```{eval-rst}
.. currentmodule:: hanlp.components

.. autoclass:: hanlp.components.parsers.biaffine.biaffine_sdp.BiaffineSemanticDependencyParser
	:members:

```


================================================
FILE: docs/api/hanlp/components/parsers/crf_constituency_parser.md
================================================
# crf_constituency_parser

CRF constituency parser.

```{eval-rst}
.. currentmodule:: hanlp.components

.. autoclass:: hanlp.components.parsers.constituency.crf_constituency_parser.CRFConstituencyParser
	:members:

```


================================================
FILE: docs/api/hanlp/components/parsers/index.md
================================================
# parsers

Parsers.

```{toctree}
biaffine_dep
biaffine_sdp
ud_parser
crf_constituency_parser
```


================================================
FILE: docs/api/hanlp/components/parsers/ud_parser.md
================================================
# ud_parser

Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing).

```{eval-rst}
.. currentmodule:: hanlp.components

.. autoclass:: hanlp.components.parsers.ud.ud_parser.UniversalDependenciesParser
	:members:

```


================================================
FILE: docs/api/hanlp/components/pipeline.md
================================================
# pipeline

```{eval-rst}
.. currentmodule:: hanlp.components.pipeline

.. autoclass:: hanlp.components.pipeline.Pipe
	:members:
	
.. autoclass:: hanlp.components.pipeline.Pipeline
	:members:

```


================================================
FILE: docs/api/hanlp/components/srl/index.md
================================================
# srl

Semantic Role Labelers.

```{toctree}
span_rank
span_bio
```


================================================
FILE: docs/api/hanlp/components/srl/span_bio.md
================================================
# span_bio

Span BIO tagging based SRL.

```{eval-rst}
.. currentmodule:: hanlp.components.srl.span_bio.span_bio

.. autoclass:: SpanBIOSemanticRoleLabeler
	:members:

```


================================================
FILE: docs/api/hanlp/components/srl/span_rank.md
================================================
# span_rank

Span Rank based SRL.

```{eval-rst}
.. currentmodule:: hanlp.components.srl.span_rank.span_rank

.. autoclass:: SpanRankingSemanticRoleLabeler
	:members:

```


================================================
FILE: docs/api/hanlp/components/sts.md
================================================
# sts

```{eval-rst}
.. currentmodule:: hanlp.components.sts

.. autoclass:: hanlp.components.sts.transformer_sts.TransformerSemanticTextualSimilarity
	:members:

```


================================================
FILE: docs/api/hanlp/components/taggers/index.md
================================================
# taggers

Taggers.

```{toctree}
transformer_tagger
rnn_tagger
```


================================================
FILE: docs/api/hanlp/components/taggers/rnn_tagger.md
================================================
# rnn_tagger

RNN based tagger.

```{eval-rst}
.. currentmodule:: hanlp.components

.. autoclass:: hanlp.components.taggers.rnn_tagger.RNNTagger
	:members:

```


================================================
FILE: docs/api/hanlp/components/taggers/transformer_tagger.md
================================================
# transformer_tagger

Transformer based tagger.

```{eval-rst}
.. currentmodule:: hanlp.components

.. autoclass:: hanlp.components.taggers.transformers.transformer_tagger.TransformerTagger
	:members:

```


================================================
FILE: docs/api/hanlp/components/tokenizers/index.md
================================================
# tokenizers

Tokenizers.

```{toctree}
transformer
multi_criteria
```


================================================
FILE: docs/api/hanlp/components/tokenizers/multi_criteria.md
================================================
# multi_criteria

Transformer based Multi-Criteria Word tokenizer.

```{eval-rst}
.. currentmodule:: hanlp.components.tokenizers.multi_criteria_cws_transformer

.. autoclass:: hanlp.components.tokenizers.multi_criteria_cws_transformer.MultiCriteriaTransformerTaggingTokenizer
	:members:

```


================================================
FILE: docs/api/hanlp/components/tokenizers/transformer.md
================================================
# transformer

Transformer based tokenizer.

```{eval-rst}
.. currentmodule:: hanlp.components.tokenizers.transformer

.. autoclass:: hanlp.components.tokenizers.transformer.TransformerTaggingTokenizer
	:members:

```


================================================
FILE: docs/api/hanlp/datasets/constituency/constituency_dataset.md
================================================
# constituency_dataset

```{eval-rst}

.. autoclass:: hanlp.datasets.parsing.loaders.constituency_dataset.ConstituencyDataset
	:members:

```


================================================
FILE: docs/api/hanlp/datasets/constituency/index.md
================================================
# con

Constituency parsing datasets.

```{toctree}
constituency_dataset
resources
```


================================================
FILE: docs/api/hanlp/datasets/constituency/resources.md
================================================
# resources

## Chinese Treebank


### CTB8


````{margin} **Discussion**
```{seealso}
About our data split on [our forum](https://bbs.hankcs.com/t/topic/3024).
```
````

```{eval-rst}


.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_TRAIN
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_DEV
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_TEST

```

### CTB9

````{margin} **Discussion**
```{seealso}
About our data split on [our forum](https://bbs.hankcs.com/t/topic/3024).
```
````

```{eval-rst}


.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_TRAIN
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_DEV
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_TEST

```

## English Treebank

### PTB

```{eval-rst}

.. autodata:: hanlp.datasets.parsing.ptb.PTB_TRAIN
.. autodata:: hanlp.datasets.parsing.ptb.PTB_DEV
.. autodata:: hanlp.datasets.parsing.ptb.PTB_TEST

```


================================================
FILE: docs/api/hanlp/datasets/dep/conll_dataset.md
================================================
# conll

```{eval-rst}
.. currentmodule:: hanlp.datasets.parsing.loaders.conll_dataset 


.. autoclass:: CoNLLParsingDataset
	:members:

```


================================================
FILE: docs/api/hanlp/datasets/dep/index.md
================================================
# dep

Dependency parsing datasets.

```{toctree}
conll_dataset
resources
```


================================================
FILE: docs/api/hanlp/datasets/dep/resources.md
================================================
# resources

## PKU Multiview Treebank

PKU Multi-view Chinese Treebank, released by PKU-ICL. It contains sentences from People's Daily (19980101-19980110).
It has 14463 sentences in total.

```{eval-rst}

.. automodule:: hanlp.datasets.parsing.pmt1
    :members:

```

## Chinese Treebank

### CTB5

```{eval-rst}

.. automodule:: hanlp.datasets.parsing.ctb5
    :members:

```

### CTB7

```{eval-rst}

.. automodule:: hanlp.datasets.parsing.ctb7
    :members:

```

### CTB8

```{eval-rst}

.. Attention::

    We propose a new data split for CTB which differs from the academic convention and has the following 3 advantages.
    
    - Easy to reproduce. Files ending with ``8`` go to dev set, ending with ``9`` go to the test set, otherwise go to the training set.
    - Full use of CTB8. The conventional academic split omits 50 gold files while we include them.
    - More balanced split across genres. The proportions of samples in each genre are similar.
    
    We also use Stanford Dependencies 3.3.0 which offers fine-grained relations and more grammars than the conventional
    head finding rules introduced by :cite:`zhang-clark-2008-tale`.
    
    Therefore, scores on our preprocessed CTB8 are not directly comparable to those in most of the literature. We have
    experimented with the same model on the conventionally preprocessed CTB8 and the scores could be 4~5 points higher.
    We believe it's worthwhile since HanLP is made for practical purposes, not just for producing pretty numbers.
    
```

````{margin} **Discussion**
```{seealso}
We have a discussion on [our forum](https://bbs.hankcs.com/t/topic/3024).
```
````

```{eval-rst}


.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_SD330_TRAIN
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_SD330_DEV
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_SD330_TEST

```

### CTB9

```{eval-rst}

.. Attention::

    Similar preprocessing and splits with CTB8 are applied. See the notice above.
    
```

```{eval-rst}


.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_SD330_TRAIN
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_SD330_DEV
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_SD330_TEST

```

## English Treebank

### PTB

```{eval-rst}

.. autodata:: hanlp.datasets.parsing.ptb.PTB_SD330_TRAIN
.. autodata:: hanlp.datasets.parsing.ptb.PTB_SD330_DEV
.. autodata:: hanlp.datasets.parsing.ptb.PTB_SD330_TEST

```

## Universal Dependencies

### Languages

```{eval-rst}

.. automodule:: hanlp.datasets.parsing.ud.ud27
    :members:

```

### Multilingual

```{eval-rst}

.. automodule:: hanlp.datasets.parsing.ud.ud27m
    :members:

```


================================================
FILE: docs/api/hanlp/datasets/eos/eos.md
================================================
# eos

```{eval-rst}
.. currentmodule:: hanlp.datasets.eos.eos

.. autoclass:: SentenceBoundaryDetectionDataset
	:members:

```


================================================
FILE: docs/api/hanlp/datasets/eos/index.md
================================================
# eos

Sentence boundary detection datasets.

```{toctree}
eos
resources
```


================================================
FILE: docs/api/hanlp/datasets/eos/resources.md
================================================
# resources

## nn_eos

```{eval-rst}

.. automodule:: hanlp.datasets.eos.loaders.nn_eos
    :members:

```

================================================
FILE: docs/api/hanlp/datasets/index.md
================================================
# datasets

```{eval-rst}
NLP datasets grouped by tasks. For each task, we provide at least one ``torch.utils.data.Dataset`` compatible class
and several open-source resources. Their file format and description can be found in their ``Dataset.load_file`` 
documents. Their contents are split into ``TRAIN``, ``DEV`` and ``TEST`` portions, each of which is stored in
a Python constant which can be fetched using :meth:`~hanlp.utils.io_util.get_resource`.
``` 

````{margin} **Professionals use Linux**
```{note}
Many preprocessing scripts written by professionals make heavy use of Linux/Unix tool chains like shell, perl, gcc,
etc., which are unavailable or buggy on Windows. You may need a *nix environment to run these scripts.
```
````

```{toctree}
eos/index
tok/index
pos/index
ner/index
dep/index
srl/index
constituency/index
```


================================================
FILE: docs/api/hanlp/datasets/ner/index.md
================================================
# ner

NER datasets.

```{toctree}
tsv
json
resources
```


================================================
FILE: docs/api/hanlp/datasets/ner/json.md
================================================
# json

```{eval-rst}
.. currentmodule:: hanlp.datasets.ner.loaders.json_ner

.. autoclass:: JsonNERDataset
	:members:

```


================================================
FILE: docs/api/hanlp/datasets/ner/resources.md
================================================
# resources

## CoNLL 2003

```{eval-rst}

.. automodule:: hanlp.datasets.ner.conll03
    :members:

```

## MSRA

```{eval-rst}

.. automodule:: hanlp.datasets.ner.msra
    :members:

```

## OntoNotes5

```{eval-rst}

.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TRAIN
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_DEV
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TEST

.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_TRAIN
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_DEV
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_TEST

```

## Resume

```{eval-rst}

.. automodule:: hanlp.datasets.ner.resume
    :members:
```

## Weibo


```{eval-rst}

.. automodule:: hanlp.datasets.ner.weibo
    :members:
```

================================================
FILE: docs/api/hanlp/datasets/ner/tsv.md
================================================
# tsv

```{eval-rst}
.. currentmodule:: hanlp.datasets.ner.loaders.tsv

.. autoclass:: TSVTaggingDataset
	:members:

```


================================================
FILE: docs/api/hanlp/datasets/pos/index.md
================================================
# pos

PoS datasets. 

```{eval-rst}
PoS is a normal tagging task which uses :class:`hanlp.datasets.ner.loaders.tsv.TSVTaggingDataset` for loading.
```

```{toctree}
resources
```


================================================
FILE: docs/api/hanlp/datasets/pos/resources.md
================================================
# resources

## CTB5

```{eval-rst}

.. automodule:: hanlp.datasets.pos.ctb5
    :members:

```

## CTB8

```{eval-rst}

.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_TRAIN
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_DEV
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_TEST

```

## CTB9


```{eval-rst}


.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_TRAIN
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_DEV
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_TEST

```

================================================
FILE: docs/api/hanlp/datasets/srl/conll2012_dataset.md
================================================
# conll2012_dataset

```{eval-rst}

.. autoclass:: hanlp.datasets.srl.loaders.conll2012.CoNLL2012SRLDataset
	:members:

```


================================================
FILE: docs/api/hanlp/datasets/srl/index.md
================================================
# srl

Semantic Role Labeling datasets.

```{toctree}
conll2012_dataset
resources
```


================================================
FILE: docs/api/hanlp/datasets/srl/resources.md
================================================
# resources

## OntoNotes 5

### Chinese

```{eval-rst}

.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TRAIN
    :noindex:
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_DEV
    :noindex:
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TEST
    :noindex:

```


================================================
FILE: docs/api/hanlp/datasets/tok/index.md
================================================
# tok

Tokenization datasets.

```{toctree}
txt
mcws_dataset
resources
```


================================================
FILE: docs/api/hanlp/datasets/tok/mcws_dataset.md
================================================
# mcws_dataset

```{eval-rst}
.. currentmodule:: hanlp.datasets.tokenization.loaders.multi_criteria_cws.mcws_dataset

.. autoclass:: MultiCriteriaTextTokenizingDataset
	:members:

```


================================================
FILE: docs/api/hanlp/datasets/tok/resources.md
================================================
# resources

## sighan2005

[The Second International Chinese Word Segmentation Bakeoff](http://sighan.cs.uchicago.edu/bakeoff2005/) took place over the summer of 2005.

### pku

```{eval-rst}

.. automodule:: hanlp.datasets.tokenization.sighan2005.pku
    :members:

```

### msr

```{eval-rst}

.. automodule:: hanlp.datasets.tokenization.sighan2005.msr
    :members:

```

### as

```{eval-rst}

.. automodule:: hanlp.datasets.tokenization.sighan2005.as_
    :members:

```

### cityu

```{eval-rst}

.. automodule:: hanlp.datasets.tokenization.sighan2005.cityu
    :members:

```

## CTB6

```{eval-rst}

.. automodule:: hanlp.datasets.tokenization.ctb6
    :members:

```

## CTB8


```{eval-rst}

.. automodule:: hanlp.datasets.parsing.ctb8

.. autodata:: CTB8_CWS_TRAIN
.. autodata:: CTB8_CWS_DEV
.. autodata:: CTB8_CWS_TEST

```

## CTB9


```{eval-rst}

.. automodule:: hanlp.datasets.parsing.ctb9

.. autodata:: CTB9_CWS_TRAIN
.. autodata:: CTB9_CWS_DEV
.. autodata:: CTB9_CWS_TEST

```

================================================
FILE: docs/api/hanlp/datasets/tok/txt.md
================================================
# txt

```{eval-rst}
.. currentmodule:: hanlp.datasets.tokenization.loaders.txt

.. autoclass:: TextTokenizingDataset
	:members:

```


================================================
FILE: docs/api/hanlp/hanlp.rst
================================================
.. _api/main:

hanlp
==========

.. currentmodule:: hanlp

.. autofunction:: load

.. autofunction:: pipeline

================================================
FILE: docs/api/hanlp/index.md
================================================
# hanlp

Core APIs for `hanlp`.

```{toctree}
hanlp
common/index
components/index
pretrained/index
datasets/index
utils/index
layers/index
```

================================================
FILE: docs/api/hanlp/layers/decoders/biaffine_ner.md
================================================
# biaffine_ner


```{eval-rst}

.. autoclass:: hanlp.components.ner.biaffine_ner.biaffine_ner_model.BiaffineNamedEntityRecognitionDecoder
	:members:

```


================================================
FILE: docs/api/hanlp/layers/decoders/index.md
================================================
# decoders

```{toctree}
linear_crf
biaffine_ner
```


================================================
FILE: docs/api/hanlp/layers/decoders/linear_crf.md
================================================
# linear_crf


```{eval-rst}

.. autoclass:: hanlp.components.mtl.tasks.pos.LinearCRFDecoder
	:members:

```


================================================
FILE: docs/api/hanlp/layers/embeddings/char_cnn.md
================================================
# char_cnn


```{eval-rst}

.. autoclass:: hanlp.layers.embeddings.char_cnn.CharCNN
	:members:

.. autoclass:: hanlp.layers.embeddings.char_cnn.CharCNNEmbedding
	:members:

```


================================================
FILE: docs/api/hanlp/layers/embeddings/char_rnn.md
================================================
# char_rnn


```{eval-rst}

.. autoclass:: hanlp.layers.embeddings.char_rnn.CharRNN
	:members:

.. autoclass:: hanlp.layers.embeddings.char_rnn.CharRNNEmbedding
	:members:

```


================================================
FILE: docs/api/hanlp/layers/embeddings/embedding.md
================================================
# embedding


```{eval-rst}

.. autoclass:: hanlp.layers.embeddings.embedding.Embedding
	:members:

.. autoclass:: hanlp.layers.embeddings.embedding.ConcatModuleList
	:members:

.. autoclass:: hanlp.layers.embeddings.embedding.EmbeddingList
	:members:

```


================================================
FILE: docs/api/hanlp/layers/embeddings/fasttext.md
================================================
# fasttext

```{eval-rst}

.. autoclass:: hanlp.layers.embeddings.fast_text.FastTextEmbedding
	:members:

.. autoclass:: hanlp.layers.embeddings.fast_text.FastTextEmbeddingModule
	:members:

```


================================================
FILE: docs/api/hanlp/layers/embeddings/index.md
================================================
# embeddings

```{toctree}
embedding
word2vec
fasttext
char_cnn
char_rnn
transformer
```


================================================
FILE: docs/api/hanlp/layers/embeddings/transformer.md
================================================
# transformer


```{eval-rst}

.. autoclass:: hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbedding
	:members:

.. autoclass:: hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbeddingModule
	:members:

```


================================================
FILE: docs/api/hanlp/layers/embeddings/word2vec.md
================================================
# word2vec

```{eval-rst}

.. autoclass:: hanlp.layers.embeddings.word2vec.Word2VecEmbedding
	:members:

.. autoclass:: hanlp.layers.embeddings.word2vec.Word2VecEmbeddingModule
	:members:

```


================================================
FILE: docs/api/hanlp/layers/index.md
================================================
# layers

```{toctree}
embeddings/index
transformers/index
decoders/index
```


================================================
FILE: docs/api/hanlp/layers/transformers/encoder.md
================================================
# encoder


```{eval-rst}

.. autoclass:: hanlp.layers.transformers.encoder.TransformerEncoder
	:members:

```


================================================
FILE: docs/api/hanlp/layers/transformers/index.md
================================================
# transformers

```{toctree}
encoder
tokenizer
```


================================================
FILE: docs/api/hanlp/layers/transformers/tokenizer.md
================================================
# tokenizer


```{eval-rst}

.. autoclass:: hanlp.transform.transformer_tokenizer.TransformerSequenceTokenizer
	:members:

```


================================================
FILE: docs/api/hanlp/pretrained/amr.md
================================================
---
jupytext:
  formats: ipynb,md:myst
  text_representation:
    extension: .md
    format_name: myst
    format_version: '0.8'
    jupytext_version: 1.4.2
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---
# amr

AMR captures “who is doing what to whom” in a sentence. Each sentence is represented as a rooted, directed, acyclic graph with labels on edges (relations) and leaves (concepts).
Before loading an AMR model, make sure to install HanLP with the `amr` dependencies:

```shell
pip install hanlp[amr] -U
```

To parse a raw sentence into AMR:

```{eval-rst}
.. margin:: Batching is Faster

    .. Hint:: Parse multiple sentences at once for faster speed! 
```


```{code-cell} ipython3
:tags: [output_scroll]
import hanlp

amr_parser = hanlp.load(hanlp.pretrained.amr.AMR3_SEQ2SEQ_BART_LARGE)
amr = amr_parser('The boy wants the girl to believe him.')
print(amr)
```

All the pre-trained parsers and their scores are listed below.

```{eval-rst}

.. automodule:: hanlp.pretrained.amr
    :members:

```

================================================
FILE: docs/api/hanlp/pretrained/amr2text.md
================================================
---
jupytext:
  formats: ipynb,md:myst
  text_representation:
    extension: .md
    format_name: myst
    format_version: '0.8'
    jupytext_version: 1.4.2
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---
# amr2text

AMR captures “who is doing what to whom” in a sentence. Each sentence is represented as a rooted, directed, acyclic graph with labels on edges (relations) and leaves (concepts).
The goal of AMR-to-Text Generation is to recover the original sentence realization given an AMR. This task can be seen as the reverse of the structured prediction found in AMR parsing.
Before loading an AMR model, make sure to install HanLP with the `amr` dependencies:

```shell
pip install hanlp[amr] -U
```

To generate a sentence given an AMR:

```{eval-rst}
.. margin:: Batching is Faster

    .. Hint:: Generate multiple sentences at once for faster speed! 
```


```{code-cell} ipython3
:tags: [output_scroll]
import hanlp

generation = hanlp.load(hanlp.pretrained.amr2text.AMR3_GRAPH_PRETRAIN_GENERATION)
print(generation('''
(z0 / want-01
    :ARG0 (z1 / boy)
    :ARG1 (z2 / believe-01
              :ARG0 (z3 / girl)
              :ARG1 z1))
'''))
```

All the pre-trained generators and their scores are listed below.

```{eval-rst}

.. automodule:: hanlp.pretrained.amr2text
    :members:

```

================================================
FILE: docs/api/hanlp/pretrained/constituency.md
================================================
---
jupytext:
  formats: ipynb,md:myst
  text_representation:
    extension: .md
    format_name: myst
    format_version: '0.8'
    jupytext_version: 1.4.2
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---

# constituency

Constituency parsing is the process of analyzing a sentence by breaking it down into sub-phrases, also known as constituents.

To parse a tokenized sentence into constituency tree, first load a parser:

```{eval-rst}
.. margin:: Batching is Faster

    .. Hint:: To speed up, parse multiple sentences at once, and use a GPU.
```

```{code-cell} ipython3
:tags: [output_scroll]
import hanlp

con = hanlp.load(hanlp.pretrained.constituency.CTB9_CON_FULL_TAG_ELECTRA_SMALL)
```

Then feed a sequence or multiple sequences of tokens to it.

```{code-cell} ipython3
:tags: [output_scroll]
tree = con(["2021年", "HanLPv2.1", "带来", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"])
```

The constituency tree is a nested list of constituents:

```{code-cell} ipython3
:tags: [output_scroll]
tree
```

You can `str` or `print` it to get its bracketed form:

```{code-cell} ipython3
:tags: [output_scroll]
print(tree)
```

All the pre-trained parsers and their scores are listed below.

```{eval-rst}

.. automodule:: hanlp.pretrained.constituency
    :members:

```

================================================
FILE: docs/api/hanlp/pretrained/dep.md
================================================
# dep

```{eval-rst}

.. automodule:: hanlp.pretrained.dep
    :members:

```

================================================
FILE: docs/api/hanlp/pretrained/eos.md
================================================
# eos


```{eval-rst}

.. automodule:: hanlp.pretrained.eos
    :members:

```

================================================
FILE: docs/api/hanlp/pretrained/fasttext.md
================================================
# fasttext

```{eval-rst}

.. automodule:: hanlp.pretrained.fasttext
    :members:

```

================================================
FILE: docs/api/hanlp/pretrained/glove.md
================================================
# glove

```{eval-rst}

.. automodule:: hanlp.pretrained.glove
    :members:

```

================================================
FILE: docs/api/hanlp/pretrained/index.md
================================================
# pretrained

```{eval-rst}
NLP components grouped by tasks. For each task, we provide at least one :class:`~hanlp.common.component.Component` 
compatible class and several pretrained models. Each of them is stored in a Python constant which can be fetched 
using :meth:`hanlp.load`.  
``` 

```{toctree}
mtl
eos
tok
pos
ner
dep
constituency
srl
sdp
amr
amr2text
sts
word2vec
glove
fasttext
mlm
```


================================================
FILE: docs/api/hanlp/pretrained/mlm.md
================================================
---
jupytext:
  formats: ipynb,md:myst
  text_representation:
    extension: .md
    format_name: myst
    format_version: '0.8'
    jupytext_version: 1.4.2
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---

# mlm

Masked Language Model (MLM) predicts words that were originally hidden intentionally in a sentence.
To perform such prediction, first load a pre-trained MLM (e.g., `bert-base-chinese`):

````{margin} Batching is Faster
```{hint}
Predict multiple sentences in batch mode for faster speed! 
```
````

````{margin} Multilingual Support
```{note}
HanLP always supports multilingual models. Feel free to use a multilingual model listed [here](https://huggingface.co/models?pipeline_tag=fill-mask&sort=downloads).
```
````

```{code-cell} ipython3
:tags: [output_scroll]
from hanlp.components.lm.mlm import MaskedLanguageModel
mlm = MaskedLanguageModel()
mlm.load('bert-base-chinese')
```

Represent blanks (masked tokens) with `[MASK]` and let the MLM fill them:

```{code-cell} ipython3
:tags: [output_scroll]
mlm('生活的真谛是[MASK]。')
```

Batching is always faster:

```{code-cell} ipython3
:tags: [output_scroll]
mlm(['生活的真谛是[MASK]。', '巴黎是[MASK][MASK]的首都。'])
```


All the pre-trained MLM models and their details are listed in the [docs](https://huggingface.co/models?pipeline_tag=fill-mask&sort=downloads) of Hugging Face 🤗 Transformers.

================================================
FILE: docs/api/hanlp/pretrained/mtl.md
================================================
# mtl

```{eval-rst}

.. automodule:: hanlp.pretrained.mtl
    :members:

```

================================================
FILE: docs/api/hanlp/pretrained/ner.md
================================================
# ner

```{eval-rst}

.. automodule:: hanlp.pretrained.ner
    :members:

```

================================================
FILE: docs/api/hanlp/pretrained/pos.md
================================================
---
jupytext:
  formats: ipynb,md:myst
  text_representation:
    extension: .md
    format_name: myst
    format_version: '0.8'
    jupytext_version: 1.4.2
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---

# pos

The process of classifying words into their **parts of speech** and labeling them accordingly is known as **part-of-speech tagging**, **POS-tagging**, or simply **tagging**. 

To tag a tokenized sentence:

````{margin} Batching is Faster
```{hint}
Tag multiple sentences at once for faster speed! 
```
````


```{code-cell} ipython3
:tags: [output_scroll]
import hanlp

pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
pos(['我', '的', '希望', '是', '希望', '世界', '和平'])
```

````{margin} Custom Dictionary Supported
```{seealso}
See [this tutorial](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/demo_pos_dict.py) for custom dictionary.
```
````

All the pre-trained taggers and their details are listed below.

```{eval-rst}

.. automodule:: hanlp.pretrained.pos
    :members:

```

================================================
FILE: docs/api/hanlp/pretrained/sdp.md
================================================
# sdp

```{eval-rst}

.. automodule:: hanlp.pretrained.sdp
    :members:

```

================================================
FILE: docs/api/hanlp/pretrained/srl.md
================================================
---
jupytext:
  formats: ipynb,md:myst
  text_representation:
    extension: .md
    format_name: myst
    format_version: '0.8'
    jupytext_version: 1.4.2
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---

# srl

Semantic Role Labeling (SRL) is a shallow semantic parsing task that produces predicate-argument structures, i.e., the semantic roles (or participants) such as agent, patient, and theme associated with verbs.

Inputs to SRL are tokenized sentences:

````{margin} Batching is Faster
```{hint}
Feed in multiple sentences at once for faster speed! 
```
````


```{code-cell} ipython3
:tags: [output_scroll]
import hanlp

srl = hanlp.load(hanlp.pretrained.srl.CPB3_SRL_ELECTRA_SMALL)
srl(['男孩', '希望', '女孩', '相信', '他', '。'])
```

All the pre-trained labelers and their details are listed below.

```{eval-rst}

.. automodule:: hanlp.pretrained.srl
    :members:

```

================================================
FILE: docs/api/hanlp/pretrained/sts.md
================================================
---
jupytext:
  formats: ipynb,md:myst
  text_representation:
    extension: .md
    format_name: myst
    format_version: '0.8'
    jupytext_version: 1.4.2
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---

# sts

`sts` package holds pre-trained Semantic Textual Similarity (STS) models. We surveyed both supervised and unsupervised
models and we believe that unsupervised models are still immature at this moment. Unsupervised STS is good for IR but 
not NLP especially on sentences with little lexical overlap.
 

```{eval-rst}

.. automodule:: hanlp.pretrained.sts
    :members:

```

```{code-cell} ipython3
import hanlp

sim = hanlp.load(hanlp.pretrained.sts.STS_ELECTRA_BASE_ZH)
sim([
    ['看图猜一电影名', '看图猜电影'],
    ['无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'],
    ['北京到上海的动车票', '上海到北京的动车票'],
])
```

================================================
FILE: docs/api/hanlp/pretrained/tok.md
================================================
---
jupytext:
  formats: ipynb,md:myst
  text_representation:
    extension: .md
    format_name: myst
    format_version: '0.8'
    jupytext_version: 1.4.2
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---

# tok

Tokenization is a way of separating a sentence into smaller units called tokens. In lexical analysis, tokens usually refer to words.

````{margin} Batching is Faster
```{hint}
Tokenize multiple sentences at once for faster speed! 
```
````
````{margin} Custom Dictionary Supported
```{seealso}
See [this tutorial](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict.py) for custom dictionary.
```
````

To tokenize raw sentences:


```{code-cell} ipython3
:tags: [output_scroll]
import hanlp

tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
tok(['商品和服务。', '晓美焰来到北京立方庭参观自然语义科技公司'])
```

All the pre-trained tokenizers and their details are listed below.


```{eval-rst}

.. automodule:: hanlp.pretrained.tok
    :members:

```


================================================
FILE: docs/api/hanlp/pretrained/word2vec.md
================================================
---
jupytext:
  formats: ipynb,md:myst
  text_representation:
    extension: .md
    format_name: myst
    format_version: '0.8'
    jupytext_version: 1.4.2
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---

# word2vec

Word2Vec is a family of model architectures and optimizations that can be used to learn word embeddings from large unlabeled datasets. In this document, it is narrowly defined as a component that maps discrete words to distributed representations, which are dense vectors.

To perform such mapping:

````{margin} Batching is Faster
```{hint}
Map multiple tokens in batch mode for faster speed! 
```
````

````{margin} Multilingual Support
```{note}
HanLP always supports multilingual models. Feel free to use a multilingual model listed [here](http://vectors.nlpl.eu/repository/).
```
````

```{code-cell} ipython3
:tags: [output_scroll]
import hanlp
word2vec = hanlp.load(hanlp.pretrained.word2vec.CONVSEG_W2V_NEWS_TENSITE_WORD_PKU)
word2vec('先进')
```

These vectors have already been normalized to facilitate similarity computation:

```{code-cell} ipython3
:tags: [output_scroll]
import torch
print(torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec('优秀'), dim=0))
print(torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec('水果'), dim=0))
```

Using these similarity scores, the most similar words can be found:

```{code-cell} ipython3
:tags: [output_scroll]
word2vec.most_similar('上海')
```

Word2Vec usually cannot process out-of-vocabulary (OOV) words or phrases:

```{code-cell} ipython3
:tags: [output_scroll]

word2vec.most_similar('非常寒冷') # phrases are usually OOV
```

Doc2Vec, as opposed to the Word2Vec model, can create a vectorized representation by averaging a group of words. To enable Doc2Vec for OOV words and phrases, pass `doc2vec=True`:

```{code-cell} ipython3
:tags: [output_scroll]

word2vec.most_similar('非常寒冷', doc2vec=True)
```

All the pre-trained word2vec models and their details are listed below.

```{eval-rst}

.. automodule:: hanlp.pretrained.word2vec
    :members:

```

================================================
FILE: docs/api/hanlp/utils/index.md
================================================
# utils

Utilities.

```{toctree}
io_util
```


================================================
FILE: docs/api/hanlp/utils/io_util.md
================================================
# io_util

```{eval-rst}

.. currentmodule:: hanlp.utils

.. automodule:: hanlp.utils.io_util
	:members:

```


================================================
FILE: docs/api/restful.rst
================================================
.. _api/hanlp_restful:

hanlp_restful
====================

.. currentmodule:: hanlp_restful

.. autoclass:: HanLPClient
	:members:
	:special-members:
	:exclude-members: __init__, __repr__, __weakref__

================================================
FILE: docs/api/restful_golang.md
================================================
# Golang RESTful API

## Install

```shell script
go get -u github.com/hankcs/gohanlp@main
```

## Quick Start 

Obtain an `auth` from any compatible service provider like our [free service](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178), then initiate a `HanLPClient` and call its `Parse` interface.

```go
package main

import (
	"fmt"
	"github.com/hankcs/gohanlp/hanlp"
)

func main() {
    client := hanlp.HanLPClient(hanlp.WithAuth("The auth you applied for")) // anonymous users can skip auth
    s, _ := client.Parse("In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.",hanlp.WithLanguage("mul"))
    fmt.Println(s)
}
```

Refer to our [testcases](https://github.com/hankcs/gohanlp/blob/main/main_test.go) and [data format](../data_format) for more details.


================================================
FILE: docs/api/restful_java.md
================================================
# Java RESTful API

Add the following dependency into the `pom.xml` file of your project. 

```xml
<dependency>
  <groupId>com.hankcs.hanlp.restful</groupId>
  <artifactId>hanlp-restful</artifactId>
  <version>0.0.15</version>
</dependency>
```

Obtain an `auth` from any compatible service provider like our [free service](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178), then initiate a `HanLPClient` and call its `parse` interface.

```java
HanLPClient client = new HanLPClient("https://hanlp.hankcs.com/api", null); // Replace null with your auth
System.out.println(client.parse("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京立方庭参观自然语义科技公司。"));
```

Refer to our [testcases](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_restful_java/src/test/java/com/hankcs/hanlp/restful/HanLPClientTest.java) and [data format](../data_format) for more details.


================================================
FILE: docs/api/trie/dictionary.md
================================================
# dictionary

```{eval-rst}
.. currentmodule:: hanlp_trie

.. autoclass:: hanlp_trie.dictionary.DictInterface
	:members:

.. autoclass:: hanlp_trie.dictionary.TrieDict
	:members:
```


================================================
FILE: docs/api/trie/index.md
================================================
# hanlp_trie

HanLP trie/dictionary interface and referential implementation.

```{toctree}
trie
dictionary
```


================================================
FILE: docs/api/trie/trie.md
================================================
# trie

```{eval-rst}
.. currentmodule:: hanlp_trie

.. autoclass:: hanlp_trie.trie.Node
	:members:

.. autoclass:: hanlp_trie.trie.Trie
	:members:
```


================================================
FILE: docs/conf.py
================================================
# -- Project information -----------------------------------------------------
# Sphinx build configuration for the HanLP documentation. Every module-level
# name assigned in this file is a configuration value read by Sphinx or by one
# of the extensions enabled further down.
import sys
import os
from datetime import datetime

# Make the repository root and the plugin packages importable so that
# ``import hanlp`` below works when building from the ``docs`` directory.
# The relative paths are resolved against the current working directory.
# NOTE(review): ``append`` puts these paths *after* any already-installed
# copies on ``sys.path``; if a non-editable ``hanlp`` is installed it would be
# picked up instead of the checkout -- confirm whether ``insert(0, ...)`` is
# wanted here.
sys.path.append(os.path.abspath('..'))
sys.path.append(os.path.abspath('../plugins/hanlp_common'))
sys.path.append(os.path.abspath('../plugins/hanlp_trie'))
sys.path.append(os.path.abspath('../plugins/hanlp_restful'))
# Imported after the path tweaks above on purpose; only ``__version__`` is
# read from it in this file.
import hanlp

project = 'HanLP'
# The end year is computed at build time, so it tracks the current year.
copyright = f'2020-{datetime.now().year}, hankcs'
author = 'hankcs'

# The short X.Y version.
# (Set to the full package version string here, identical to ``release``.)
version = hanlp.__version__
# The full version, including alpha/beta/rc tags.
release = hanlp.__version__

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
language = 'en'

# Name of the root document (``index.md``), without its file extension.
master_doc = "index"

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "myst_nb",  # MyST Markdown / notebook pages (the ``{code-cell}`` blocks in the docs)
    "sphinx_copybutton",
    "sphinx_togglebutton",
    "sphinxcontrib.bibtex",  # citations backed by the BibTeX file in this directory
    'sphinx_astrorefs',  # astrophysics style, similar to ACL
    "sphinx_thebe",
    "sphinx.ext.autodoc",  # powers the ``automodule`` / ``autoclass`` directives used in the API pages
    "sphinx.ext.intersphinx",
    "sphinx.ext.viewcode",
    "ablog",  # blog support, configured in the ABlog section of this file
    'sphinx.ext.napoleon',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

# External projects whose object inventories can be cross-referenced.
# Each value is ``(base URL, inventory location)``; ``None`` is used for the
# inventory location of both targets.
intersphinx_mapping = {
    "python": ("https://docs.python.org/3.8", None),
    "sphinx": ("https://www.sphinx-doc.org/en/3.x", None),
}
# ``(domain:role, target)`` pairs listed here are exempt from
# "reference target not found" warnings in nitpicky mode.
nitpick_ignore = [
    ("py:class", "docutils.nodes.document"),
    ("py:class", "docutils.parsers.rst.directives.body.Sidebar"),
]
# autodoc: for classes, include both the class docstring and the
# ``__init__`` docstring.
autoclass_content = 'both'

# Number figures (and other numbered elements) automatically.
numfig = True

# MyST parser switches.
# NOTE(review): the ``myst_*_enable`` names look like the older MyST option
# spelling -- confirm they are still honoured by the pinned myst version.
myst_admonition_enable = True
myst_deflist_enable = True
# Only links with these URL schemes are treated as external URLs.
myst_url_schemes = ("http", "https", "mailto")
# Presumably avoids loading a second copy of Bootstrap CSS from
# sphinx-panels next to the theme's own -- TODO confirm.
panels_add_bootstrap_css = False

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_book_theme"
html_title = "HanLP Documentation"
# Logo and favicon paths are relative to this directory.
html_logo = "_static/logo.png"
html_favicon = "_static/favicon.png"
# Copy the page sources into the build output; an empty suffix keeps the
# original file names in the source links.
html_copy_source = True
html_sourcelink_suffix = ""

# No custom sidebars are active; the blog sidebar layout below is kept only
# as a commented-out template.
html_sidebars = {
    # "reference/blog/*": [
    #     "sidebar-search-bs.html",
    #     "postcard.html",
    #     "recentposts.html",
    #     "tagcloud.html",
    #     "categories.html",
    #     "archives.html",
    #     "sbt-sidebar-nav.html",
    #     "sbt-sidebar-footer.html",
    # ]
}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
# Notebook execution mode; presumably "cache" re-uses stored outputs and only
# re-executes notebooks whose code changed -- TODO confirm against the pinned
# myst_nb version.
jupyter_execute_notebooks = "cache"
# Settings for sphinx_thebe (live code cells).
# NOTE(review): the repository URL points at a binder-examples repository
# rather than HanLP -- confirm this is intentional.
thebe_config = {
    "repository_url": "https://github.com/binder-examples/jupyter-stacks-datascience",
    "repository_branch": "master",
}

# Options passed to the theme selected in ``html_theme``: they identify the
# GitHub repository and the docs folder inside it, and switch on the
# edit-page / issues / repository / download buttons.
html_theme_options = {
    "theme_dev_mode": False,
    "path_to_docs": "docs",
    "repository_url": "https://github.com/hankcs/HanLP",
    # "repository_branch": "gh-pages",  # For testing
    # "launch_buttons": {
    #     # "binderhub_url": "https://mybinder.org",
    #     # "jupyterhub_url": "https://datahub.berkeley.edu",  # For testing
    #     "colab_url": "https://colab.research.google.com/",
    #     "notebook_interface": "jupyterlab",
    #     "thebe": True,
    # },
    "use_edit_page_button": True,
    "use_issues_button": True,
    "use_repository_button": True,
    "use_download_button": True,
    # For testing
    # "home_page_in_toc": True,
    # "single_page": True,
    # "extra_footer": "<a href='https://google.com'>Test</a>",  # DEPRECATED KEY
    # "extra_navbar": "<a href='https://google.com'>Test</a>",
}
# Public base URL under which the built HTML is served.
html_baseurl = "https://hanlp.hankcs.com/docs/"

# -- ABlog config -------------------------------------------------
# Blog posts live under ``reference/blog`` and are matched by the glob below.
blog_path = "reference/blog"
blog_post_pattern = "reference/blog/*.md"
# Same public base URL as ``html_baseurl``.
blog_baseurl = "https://hanlp.hankcs.com/docs/"
fontawesome_included = True
# Presumably: use the 1st image of a post as its image and the first
# 2 paragraphs as its excerpt -- TODO confirm against the ABlog docs.
post_auto_image = 1
post_auto_excerpt = 2
# True only when the ``READTHEDOCS`` environment variable is set.
# NOTE(review): the name suggests this toggles printing of notebook execution
# tracebacks (a notebook-execution option rather than an ABlog one) -- confirm.
execution_show_tb = "READTHEDOCS" in os.environ

# Localization
# Ordered list of output MIME types for the "gettext" builder; presumably the
# first available type in this order is the one rendered -- TODO confirm.
nb_render_priority = {
    "gettext": (
        "application/vnd.jupyter.widget-view+json",
        "application/javascript",
        "text/html",
        "image/svg+xml",
        "image/png",
        "image/jpeg",
        "text/markdown",
        "text/latex",
        "text/plain",
    )
}

# Directory (relative to this file) that holds the translation catalogs.
locale_dirs = ['locale/']

# bibtex
# Default citation/bibliography style.
# NOTE(review): no ``bibtex_bibfiles`` is set in this file; newer
# sphinxcontrib-bibtex releases require it -- confirm against the pinned version.
bibtex_default_style = 'unsrtalpha'


================================================
FILE: docs/configure.md
================================================
# Configuration

## Customize ``HANLP_HOME``

All resources HanLP uses will be cached into a directory called `HANLP_HOME`. 
It is an environment variable which you can customize to any path you like. 
By default, `HANLP_HOME` resolves to `~/.hanlp` and `%appdata%\hanlp` on *nix and Windows respectively. 
If you want to redirect `HANLP_HOME` to a different location, say `/data/hanlp`, the following shell command can be very helpful.

```bash
export HANLP_HOME=/data/hanlp
```

## Use GPUs

By default, HanLP tries to use the least occupied GPU, so most of the time you don't need to worry about it: HanLP makes the best choice for you. This behavior is very useful when you're using a public server shared across your lab or company with your colleagues. 

HanLP also honors the ``CUDA_VISIBLE_DEVICES`` used by PyTorch and TensorFlow to limit which devices HanLP can choose from. For example, the following command will only keep the `0`th and `1`st GPUs.

```bash
export CUDA_VISIBLE_DEVICES=0,1
```

```{eval-rst}
If you need fine grained control over each component, ``hanlp.load(..., devices=...)`` is what you're looking for.
See documents for :meth:`hanlp.load`.
```

### External Resources

For deep learning beginners, you might need to learn how to set up a working GPU environment first. Here are some 
resources.

- [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit)
    - It's a good practice to install the driver shipped with a CUDA package. 
- [PyTorch](https://pytorch.org/get-started/locally/)
    - If no existing PyTorch found, `pip install hanlp` will have the CPU-only PyTorch installed, which is universal and assumes no GPU or CUDA dependencies. 
    - You will need to install a GPU-enabled PyTorch according to your CUDA and OS versions.
- Cloud servers
    - There are many cloud services providing out-of-the-box deep learning images. HanLP works fine on these platforms. 
        They could save your time and efforts.
- Google Colab
    - Colab allows you to write executable notebooks with full GPU support. PyTorch and TensorFlow have been pre-installed and configured to the best state.
    - In fact, you can click [![Open In Colab](https://file.hankcs.com/img/colab-badge.svg)](https://colab.research.google.com/drive/1KPX6t1y36TOzRIeB4Kt3uJ1twuj6WuFv?usp=sharing) to play with the GPU-enabled HanLP tutorial right now.


## Use Mirror Sites

By default, models are downloaded from a global CDN we maintain. However, in some regions the downloading speed can 
be slow occasionally. If you happen to be in one of those regions, you can find some third party mirror sites 
on our [bbs](https://bbs.hankcs.com/). When you find a working URL, say 
[https://ftp.hankcs.com/hanlp/](https://ftp.hankcs.com/hanlp/), you can set a `HANLP_URL` 
environment variable and HanLP will pick it up at the next startup.

```bash
export HANLP_URL=https://ftp.hankcs.com/hanlp/
```

## Control Verbosity

By default, HanLP will print progress messages to the console when you load a model. If you want to silence them, use the 
following environment variable.

```bash
export HANLP_VERBOSE=0
```


================================================
FILE: docs/contributing.md
================================================
# Contributing Guide

Thank you for being interested in contributing to `HanLP`! You
are awesome ✨.

This guideline contains information about our conventions around coding style, pull request workflow, commit messages and more.

This page also contains information to help you get started with development on this
project.

## Development

### Set-up

Get the source code of this project using git:

```bash
git clone https://github.com/hankcs/HanLP --branch master
cd HanLP
pip install -e plugins/hanlp_trie
pip install -e plugins/hanlp_common
pip install -e plugins/hanlp_restful
pip install -e .
```

To work on this project, you need Python 3.6 or newer.

### Running Tests

This project has a test suite to ensure certain important APIs work properly. The tests can be run using:

```bash
python -m unittest discover ./tests
```

```{tip}
It's hard to cover every API, especially those of deep learning models, due to the limited computation resources of CI. However, we suggest that at least all inference APIs be tested.
```

## Repository Structure

This repository is split into a few critical folders:

hanlp/
: The HanLP core package, containing the Python code.

plugins/
: Contains code shared across several individual packages or non-core APIs.

docs/
: The documentation for HanLP, which is in markdown format mostly.
: The build configuration is contained in `conf.py`.

tests/
: Testing infrastructure that uses `unittest` to ensure the output of API is what we expect it to be.

.github/
: Contains Continuous-integration (CI) workflows, run on commits/PRs to the GitHub repository.


================================================
FILE: docs/data_format.md
================================================
---
jupytext:
  formats: ipynb,md:myst
  text_representation:
    extension: .md
    format_name: myst
    format_version: '0.8'
    jupytext_version: 1.4.2
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---

# Data Format


## Input Format

### RESTful Input

#### Definition

To make a RESTful call, one needs to send a `json` HTTP POST request to the server, which contains at least a `text` 
field or a `tokens` field. The input to RESTful API is very flexible. It can be one of the following 3 formats:

1. It can be a document of raw `str` filled into `text`. The server will split it into sentences.
1. It can be a `list` of sentences, each sentence is a raw `str`, filled into `text`.
1. It can be a `list` of tokenized sentences, each sentence is a list of `str` typed tokens, filled into `tokens`.

```{eval-rst}
Additionally, fine-grained controls are performed with the arguments defined in 
:meth:`hanlp_restful.HanLPClient.parse`.
```


#### Examples

```shell script
curl -X 'POST' \
  'https://hanlp.hankcs.com/api/parse' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "language": "zh",
  "text": "HanLP为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京参观自然语义科技公司。"
}'
```

### Model Input

````{margin} **How about training inputs?**
```{seealso}
We mostly follow the conventional file format of each NLP task instead of re-inventing them. Thus, we use `.tsv` for tagging and 
`.conllu` for parsing etc. For more details, refer to [datasets](https://hanlp.hankcs.com/docs/api/hanlp/datasets/index.html).   
```
````

The input format to models is specified per model and per task. Generally speaking, if a model has no tokenizer built in, then its input is
a sentence in `list[str]` form (a list of tokens), or multiple such sentences nested in a `list`.

If a model has a tokenizer built in, each sentence is in `str` form. 
Additionally, you can use `skip_tasks='tok*'` to ask the model to use your tokenized inputs instead of tokenizing 
them, in which case each of your sentences needs to be in `list[str]` form, as if there were no tokenizer.

```{eval-rst}
For any model, its input is of sentence level, which means you have to split a document into sentences beforehand. 
You may want to try :class:`~hanlp.components.eos.ngram.NgramSentenceBoundaryDetector` for sentence splitting.
```

## Output Format


```{eval-rst}
The outputs of both :class:`~hanlp_restful.HanLPClient` and 
:class:`~hanlp.components.mtl.multi_task_learning.MultiTaskLearning` are unified as the same 
:class:`~hanlp_common.document.Document` format.
```

For example, the following RESTful codes will output such an instance.

```{code-cell} ipython3
:tags: [output_scroll]
from hanlp_restful import HanLPClient
HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None)  # Fill in your auth
print(HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京立方庭参观自然语义科技公司。'))
```

The output above is represented as a `json` dictionary where each key is a task name and its value is 
the output of the corresponding task.
For each output, if it's a nested `list` then it contains multiple sentences otherwise it's just one single sentence.

We make the following naming convention of NLP tasks, each consists of 3 letters.

````{margin} **How about annotations?**
```{seealso}
Each NLP task can exploit multiple datasets with their annotations, see our [annotations](annotations/index) for details.
```
````

### Naming Convention 

| key  | Task                                                         | Chinese      |
| ---- | ------------------------------------------------------------ | ------------ |
| tok  | Tokenization. Each element is a token.                       | 分词         |
| pos  | Part-of-Speech Tagging. Each element is a tag.               | 词性标注     |
| lem  | Lemmatization. Each element is a lemma.                      | 词干提取     |
| fea  | Features of Universal Dependencies. Each element is a feature. | 词法语法特征 |
| ner  | Named Entity Recognition. Each element is a tuple of `(entity, type, begin, end)`, where `end`s are exclusive offsets. | 命名实体识别 |
| dep  | Dependency Parsing. Each element is a tuple of `(head, relation)` where `head` starts with index `0` (which is `ROOT`). | 依存句法分析 |
| con  | Constituency Parsing. Each list is a bracketed constituent.  | 短语成分分析 |
| srl  | Semantic Role Labeling. Similar to `ner`, each element is a tuple of `(arg/pred, label, begin, end)`, where the predicate is labeled as `PRED`. | 语义角色标注 |
| sdp  | Semantic Dependency Parsing. Similar to `dep`, however each token can have any number (including zero) of heads and corresponding relations. | 语义依存分析 |
| amr  | Abstract Meaning Representation. Each AMR graph is represented as list of logical triples. See [AMR guidelines](https://github.com/amrisi/amr-guidelines/blob/master/amr.md#example). | 抽象意义表示 |

When there are multiple models performing the same task, their keys are appended with a secondary identifier. 
For example, `tok/fine` and `tok/coarse` mean a fine-grained tokenization model and a coarse-grained one respectively.

================================================
FILE: docs/index.md
================================================
# HanLP: Han Language Processing

[![GitHub stars](https://img.shields.io/github/stars/hankcs/HanLP)](https://github.com/hankcs/HanLP/stargazers) [![GitHub forks](https://img.shields.io/github/forks/hankcs/HanLP)](https://github.com/hankcs/HanLP/network) ![pypi](https://img.shields.io/pypi/v/HanLP) [![Downloads](https://static.pepy.tech/badge/HanLP)](https://pepy.tech/project/HanLP) [![GitHub license](https://img.shields.io/github/license/hankcs/HanLP)](https://github.com/hankcs/HanLP/blob/master/LICENSE) [![Open In Colab](https://file.hankcs.com/img/colab-badge.svg)](https://colab.research.google.com/drive/1KPX6t1y36TOzRIeB4Kt3uJ1twuj6WuFv?usp=sharing)

The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing 
state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be 
efficient, user friendly and extendable. It comes with pretrained models for various human languages 
including English, Chinese, Japanese and many others.


## Tutorials

```{toctree}
:maxdepth: 1
:caption: Introduction

tutorial
install
configure
data_format
annotations/index
contributing
Live Demo <https://hanlp.hankcs.com/>
```

## Python API

```{toctree}
:caption: Python API
:maxdepth: 2

api/hanlp/index
api/common/index
api/restful
api/trie/index
```

## Java API

```{toctree}
:maxdepth: 1
:caption: Java API

1.x API <https://github.com/hankcs/HanLP/tree/1.x>
api/restful_java
```

## Golang API

```{toctree}
:maxdepth: 1
:caption: Golang API

api/restful_golang
```

## References

```{toctree}
:caption: References
:maxdepth: 2

references
```


## Acknowledgements

HanLPv2.1 is heavily inspired by [AllenNLP](https://allennlp.org/) and [SuPar](https://pypi.org/project/supar/). 

[pypi-badge]: https://img.shields.io/pypi/v/hanlp.svg
[pypi-link]: https://pypi.org/project/hanlp


================================================
FILE: docs/install.md
================================================
# Install

```{figure} _static/install-versions.svg
---
width: 100%
figclass: caption
alt: HanLP versions
name: hanlp-versions
---
Choose your HanLP version
```

## Install RESTful Packages

[![Downloads](https://static.pepy.tech/badge/hanlp-restful)](https://pepy.tech/project/hanlp-restful) [![Downloads](https://static.pepy.tech/badge/hanlp-restful/month)](https://pepy.tech/project/hanlp-restful) [![Downloads](https://static.pepy.tech/badge/hanlp-restful/week)](https://pepy.tech/project/hanlp-restful) 

```{eval-rst}
.. margin:: **Beginners Attention**

    .. Hint:: New to NLP? Just install RESTful packages and call :meth:`~hanlp_restful.HanLPClient.parse` without pain.
```

For beginners, the recommended RESTful packages are easier to start with. 
The only requirement is [an auth key](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178). 
We officially released the following language bindings:

### Python

```shell script
pip install hanlp_restful
```

### Java

See [Java instructions](https://hanlp.hankcs.com/docs/api/restful_java.html).

### Golang

See [Golang instructions](https://hanlp.hankcs.com/docs/api/restful_golang.html).

## Install Native Package

[![Downloads](https://static.pepy.tech/badge/hanlp)](https://pepy.tech/project/hanlp) [![Downloads](https://static.pepy.tech/badge/hanlp/month)](https://pepy.tech/project/hanlp) [![Downloads](https://static.pepy.tech/badge/hanlp/week)](https://pepy.tech/project/hanlp)  

The native package running locally can be installed via pip.

````{margin} **Install from Source**
```{note}
See [developer guideline](https://hanlp.hankcs.com/docs/contributing.html#development).
```
````

```
pip install hanlp
```

HanLP requires Python 3.6 or later. GPU/TPU is suggested but not mandatory. Depending on your preference, HanLP offers the following flavors:

````{margin} **Windows Support**
```{note}
Installation on Windows is **perfectly** supported. No need to install Microsoft Visual C++ Build Tools anymore. 
```
````

````{margin} **Apple Silicon**
```{note}
HanLP also perfectly supports accelerating on Apple Silicon M1 chips, see [tutorial](https://www.hankcs.com/nlp/hanlp-official-m1-support.html).
```
````

| Flavor  | Description                                                  |
| ------- | ------------------------------------------------------------ |
| default | This installs the default version which delivers the most commonly used functionalities. However, some heavy dependencies like TensorFlow are not installed. |
| tf      | This installs TensorFlow and fastText.                       |
| amr     | To support Abstract Meaning Representation (AMR) models, this installs AMR related dependencies like `penman`. |
| full    | For experts who seek to maximize the efficiency via TensorFlow and C++ extensions, `pip install hanlp[full]` installs all the above dependencies. |


## Install Models

In short, you don't need to manually install any model. Instead, they are automatically downloaded to a directory called [`HANLP_HOME`](https://hanlp.hankcs.com/docs/configure.html#customize-hanlp-home) when you call `hanlp.load`.
Occasionally, some errors might occur the first time you load a model, in which case you can refer to the following tips.

### Download Error

#### HanLP Models

If the auto-download of a HanLP model fails, you can either:

1. Retry as our file server might be busy serving users from all over the world.
1. Follow the message on your terminal, which often guides you to manually download a `zip` file to a particular path. 
1. Use a [mirror site](https://hanlp.hankcs.com/docs/configure.html#use-mirror-sites) which could be faster and more stable in your region.

#### Hugging Face 🤗 Transformers Models

If the auto-download of a Hugging Face 🤗 Transformers model fails, e.g., the following exception is thrown:

```bash
lib/python3.8/site-packages/transformers/file_utils.py", line 2102, in get_from_cache
    raise ValueError(
ValueError: Connection error, and we cannot find the requested files in the cached 
path. Please try again or make sure your Internet connection is on.
```

You can either:

1. Retry as the Internet is quite unstable in some regions (e.g., China).

2. Force Hugging Face 🤗 Transformers to use cached models instead of checking updates from the Internet **if you have ever successfully loaded it before**, by setting the following environment variable:

   ```bash
   export TRANSFORMERS_OFFLINE=1
   ```

### Server without Internet

If your server has no Internet access at all, just debug your code on your local PC and copy the following directories to your server via a USB disk or something similar.

1. `~/.hanlp`: the home directory for HanLP models.
1. `~/.cache/huggingface`: the home directory for Hugging Face 🤗 Transformers.


### Import Error

Some TensorFlow/fastText models will ask you to install the missing TensorFlow/fastText modules, in which case you'll need to install the full version:

```shell script
pip install hanlp[full]
```

```{danger}
NEVER install third-party packages (TensorFlow/fastText etc.) by yourself, as higher or lower versions of third-party packages have not been tested and might not work properly.
```

================================================
FILE: docs/references.bib
================================================
%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/

%% Created for hankcs at 2022-12-07 15:02:16 -0500 


%% Saved with string encoding Unicode (UTF-8) 


@inproceedings{bai-etal-2022-graph,
	address = {Dublin, Ireland},
	author = {Bai, Xuefeng and Chen, Yulong and Zhang, Yue},
	booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
	date-added = {2022-12-07 15:02:15 -0500},
	date-modified = {2022-12-07 15:02:15 -0500},
	month = may,
	pages = {6001--6015},
	publisher = {Association for Computational Linguistics},
	title = {Graph Pre-training for {AMR} Parsing and Generation},
	url = {https://aclanthology.org/2022.acl-long.415},
	year = {2022},
	bdsk-url-1 = {https://aclanthology.org/2022.acl-long.415}}

@inproceedings{wang-etal-2021-minilmv2,
	address = {Online},
	author = {Wang, Wenhui and Bao, Hangbo and Huang, Shaohan and Dong, Li and Wei, Furu},
	booktitle = {Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021},
	date-added = {2022-06-14 20:10:18 -0400},
	date-modified = {2022-06-14 20:10:18 -0400},
	doi = {10.18653/v1/2021.findings-acl.188},
	month = aug,
	pages = {2140--2151},
	publisher = {Association for Computational Linguistics},
	title = {{M}ini{LM}v2: Multi-Head Self-Attention Relation Distillation for Compressing Pretrained Transformers},
	url = {https://aclanthology.org/2021.findings-acl.188},
	year = {2021},
	bdsk-url-1 = {https://aclanthology.org/2021.findings-acl.188},
	bdsk-url-2 = {https://doi.org/10.18653/v1/2021.findings-acl.188}}

@article{zhang2021mengzi,
	author = {Zhang, Zhuosheng and Zhang, Hanqing and Chen, Keming and Guo, Yuhang and Hua, Jingyun and Wang, Yulong and Zhou, Ming},
	date-added = {2022-04-15 10:32:14 -0400},
	date-modified = {2022-04-15 10:32:14 -0400},
	journal = {arXiv preprint arXiv:2110.06696},
	title = {Mengzi: Towards Lightweight yet Ingenious Pre-trained Models for Chinese},
	year = {2021}}

@inproceedings{samuel-straka-2020-ufal,
	abstract = {We present PERIN, a novel permutation-invariant approach to sentence-to-graph semantic parsing. PERIN is a versatile, cross-framework and language independent architecture for universal modeling of semantic structures. Our system participated in the CoNLL 2020 shared task, Cross-Framework Meaning Representation Parsing (MRP 2020), where it was evaluated on five different frameworks (AMR, DRG, EDS, PTG and UCCA) across four languages. PERIN was one of the winners of the shared task. The source code and pretrained models are available at http://www.github.com/ufal/perin.},
	address = {Online},
	author = {Samuel, David and Straka, Milan},
	booktitle = {Proceedings of the CoNLL 2020 Shared Task: Cross-Framework Meaning Representation Parsing},
	date-added = {2022-04-12 22:36:23 -0400},
	date-modified = {2022-04-12 22:36:23 -0400},
	doi = {10.18653/v1/2020.conll-shared.5},
	month = nov,
	pages = {53--64},
	publisher = {Association for Computational Linguistics},
	title = {{{\'U}FAL} at {MRP} 2020: Permutation-invariant Semantic Parsing in {PERIN}},
	url = {https://aclanthology.org/2020.conll-shared.5},
	year = {2020},
	bdsk-url-1 = {https://aclanthology.org/2020.conll-shared.5},
	bdsk-url-2 = {https://doi.org/10.18653/v1/2020.conll-shared.5}}

@inproceedings{qiu-etal-2014-multi,
	address = {Dublin, Ireland},
	author = {Qiu, Likun and Zhang, Yue and Jin, Peng and Wang, Houfeng},
	booktitle = {Proceedings of {COLING} 2014, the 25th International Conference on Computational Linguistics: Technical Papers},
	date-added = {2022-02-15 04:42:58 -0500},
	date-modified = {2022-02-15 04:42:58 -0500},
	month = aug,
	pages = {257--268},
	publisher = {Dublin City University and Association for Computational Linguistics},
	title = {Multi-view {C}hinese Treebanking},
	url = {https://aclanthology.org/C14-1026},
	year = {2014},
	bdsk-url-1 = {https://aclanthology.org/C14-1026}}

@inproceedings{li-etal-2018-analogical,
	abstract = {Analogical reasoning is effective in capturing linguistic regularities. This paper proposes an analogical reasoning task on Chinese. After delving into Chinese lexical knowledge, we sketch 68 implicit morphological relations and 28 explicit semantic relations. A big and balanced dataset CA8 is then built for this task, including 17813 questions. Furthermore, we systematically explore the influences of vector representations, context features, and corpora on analogical reasoning. With the experiments, CA8 is proved to be a reliable benchmark for evaluating Chinese word embeddings.},
	address = {Melbourne, Australia},
	author = {Li, Shen and Zhao, Zhe and Hu, Renfen and Li, Wensi and Liu, Tao and Du, Xiaoyong},
	booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
	date-added = {2022-01-30 22:52:52 -0500},
	date-modified = {2022-01-30 22:52:52 -0500},
	doi = {10.18653/v1/P18-2023},
	month = jul,
	pages = {138--143},
	publisher = {Association for Computational Linguistics},
	title = {Analogical Reasoning on {C}hinese Morphological and Semantic Relations},
	url = {https://aclanthology.org/P18-2023},
	year = {2018},
	bdsk-url-1 = {https://aclanthology.org/P18-2023},
	bdsk-url-2 = {https://doi.org/10.18653/v1/P18-2023}}

@inproceedings{NIPS2013_9aa42b31,
	author = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S and Dean, Jeff},
	booktitle = {Advances in Neural Information Processing Systems},
	date-added = {2022-01-30 18:17:28 -0500},
	date-modified = {2022-01-30 18:17:28 -0500},
	editor = {C. J. C. Burges and L. Bottou and M. Welling and Z. Ghahramani and K. Q. Weinberger},
	publisher = {Curran Associates, Inc.},
	title = {Distributed Representations of Words and Phrases and their Compositionality},
	url = {https://proceedings.neurips.cc/paper/2013/file/9aa42b31882ec039965f3c4923ce901b-Paper.pdf},
	volume = {26},
	year = {2013},
	bdsk-url-1 = {https://proceedings.neurips.cc/paper/2013/file/9aa42b31882ec039965f3c4923ce901b-Paper.pdf}}

@inproceedings{bevilacqua-etal-2021-one,
	author = {Bevilacqua, Michele and Blloshmi, Rexhina and Navigli, Roberto},
	booktitle = {Proceedings of AAAI},
	date-added = {2022-01-25 11:58:03 -0500},
	date-modified = {2022-01-25 11:58:03 -0500},
	title = {One {SPRING} to Rule Them Both: {S}ymmetric {AMR} Semantic Parsing and Generation without a Complex Pipeline},
	year = {2021}}

@inproceedings{lewis-etal-2020-bart,
	abstract = {We present BART, a denoising autoencoder for pretraining sequence-to-sequence models. BART is trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text. It uses a standard Transformer-based neural machine translation architecture which, despite its simplicity, can be seen as generalizing BERT (due to the bidirectional encoder), GPT (with the left-to-right decoder), and other recent pretraining schemes. We evaluate a number of noising approaches, finding the best performance by both randomly shuffling the order of sentences and using a novel in-filling scheme, where spans of text are replaced with a single mask token. BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It matches the performance of RoBERTa on GLUE and SQuAD, and achieves new state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 3.5 ROUGE. BART also provides a 1.1 BLEU increase over a back-translation system for machine translation, with only target language pretraining. We also replicate other pretraining schemes within the BART framework, to understand their effect on end-task performance.},
	address = {Online},
	author = {Lewis, Mike and Liu, Yinhan and Goyal, Naman and Ghazvininejad, Marjan and Mohamed, Abdelrahman and Levy, Omer and Stoyanov, Veselin and Zettlemoyer, Luke},
	booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
	date-added = {2022-01-25 11:56:10 -0500},
	date-modified = {2022-01-25 11:56:10 -0500},
	doi = {10.18653/v1/2020.acl-main.703},
	month = jul,
	pages = {7871--7880},
	publisher = {Association for Computational Linguistics},
	title = {{BART}: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension},
	url = {https://www.aclweb.org/anthology/2020.acl-main.703},
	year = {2020},
	bdsk-url-1 = {https://www.aclweb.org/anthology/2020.acl-main.703},
	bdsk-url-2 = {https://doi.org/10.18653/v1/2020.acl-main.703}}

@article{knight2014abstract,
	author = {Knight, Kevin and Banarescu, Laura and Bonial, Claire and Georgescu, Madalina and Griffitt, Kira and Hermjakob, Ulf and Marcu, Daniel and Palmer, Martha and Schneider, Nathan},
	date-added = {2022-01-25 11:54:11 -0500},
	date-modified = {2022-01-25 11:54:11 -0500},
	journal = {Web download},
	title = {Abstract Meaning Representation ({AMR}) Annotation Release 1.0},
	year = {2014}}

@inproceedings{he-choi-2021-stem,
	abstract = {Multi-task learning with transformer encoders (MTL) has emerged as a powerful technique to improve performance on closely-related tasks for both accuracy and efficiency while a question still remains whether or not it would perform as well on tasks that are distinct in nature. We first present MTL results on five NLP tasks, POS, NER, DEP, CON, and SRL, and depict its deficiency over single-task learning. We then conduct an extensive pruning analysis to show that a certain set of attention heads get claimed by most tasks during MTL, who interfere with one another to fine-tune those heads for their own objectives. Based on this finding, we propose the Stem Cell Hypothesis to reveal the existence of attention heads naturally talented for many tasks that cannot be jointly trained to create adequate embeddings for all of those tasks. Finally, we design novel parameter-free probes to justify our hypothesis and demonstrate how attention heads are transformed across the five tasks during MTL through label analysis.},
	address = {Online and Punta Cana, Dominican Republic},
	author = {He, Han and Choi, Jinho D.},
	booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
	date-added = {2021-11-06 18:24:44 -0400},
	date-modified = {2021-11-06 18:24:44 -0400},
	month = nov,
	pages = {5555--5577},
	publisher = {Association for Computational Linguistics},
	title = {The Stem Cell Hypothesis: Dilemma behind Multi-Task Learning with Transformer Encoders},
	url = {https://aclanthology.org/2021.emnlp-main.451},
	year = {2021},
	bdsk-url-1 = {https://aclanthology.org/2021.emnlp-main.451}}

@inproceedings{he-choi-2019,
	abstract = {This paper presents new state-of-the-art models for three tasks, part-of-speech tagging, syntactic parsing, and semantic parsing, using the cutting-edge contextualized embedding framework known as BERT. For each task, we first replicate and simplify the current state-of-the-art approach to enhance its model efficiency. We then evaluate our simplified approaches on those three tasks using token embeddings generated by BERT. 12 datasets in both English and Chinese are used for our experiments. The BERT models outperform the previously best-performing models by 2.5\% on average (7.5\% for the most significant case). All models and source codes are available in public so that researchers can improve upon and utilize them to establish strong baselines for the next decade.},
	author = {Han He and Jinho Choi},
	booktitle = {The Thirty-Third International Flairs Conference},
	conference = {Florida Artificial Intelligence Research Society Conference},
	date-added = {2021-10-16 21:09:00 -0400},
	date-modified = {2021-10-16 21:09:00 -0400},
	keywords = {part-of-speech tagging, syntactic parsing, semantic parsing, Transformer, BERT},
	title = {Establishing Strong Baselines for the New Decade: Sequence Tagging, Syntactic and Semantic Parsing with BERT},
	url = {https://www.aaai.org/ocs/index.php/FLAIRS/FLAIRS20/paper/view/18438},
	year = {2020},
	bdsk-url-1 = {https://www.aaai.org/ocs/index.php/FLAIRS/FLAIRS20/paper/view/18438}}

@inproceedings{xiao-etal-2021-ernie,
	abstract = {Coarse-grained linguistic information, such as named entities or phrases, facilitates adequately representation learning in pre-training. Previous works mainly focus on extending the objective of BERT{'}s Masked Language Modeling (MLM) from masking individual tokens to contiguous sequences of n tokens. We argue that such contiguously masking method neglects to model the intra-dependencies and inter-relation of coarse-grained linguistic information. As an alternative, we propose ERNIE-Gram, an explicitly n-gram masking method to enhance the integration of coarse-grained information into pre-training. In ERNIE-Gram, n-grams are masked and predicted directly using explicit n-gram identities rather than contiguous sequences of n tokens. Furthermore, ERNIE-Gram employs a generator model to sample plausible n-gram identities as optional n-gram masks and predict them in both coarse-grained and fine-grained manners to enable comprehensive n-gram prediction and relation modeling. We pre-train ERNIE-Gram on English and Chinese text corpora and fine-tune on 19 downstream tasks. Experimental results show that ERNIE-Gram outperforms previous pre-training models like XLNet and RoBERTa by a large margin, and achieves comparable results with state-of-the-art methods. The source codes and pre-trained models have been released at https://github.com/PaddlePaddle/ERNIE.},
	address = {Online},
	author = {Xiao, Dongling and Li, Yu-Kun and Zhang, Han and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng},
	booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
	date-added = {2021-09-04 14:09:52 -0400},
	date-modified = {2021-09-04 14:09:52 -0400},
	doi = {10.18653/v1/2021.naacl-main.136},
	month = jun,
	pages = {1702--1715},
	publisher = {Association for Computational Linguistics},
	title = {{ERNIE}-Gram: Pre-Training with Explicitly N-Gram Masked Language Modeling for Natural Language Understanding},
	url = {https://aclanthology.org/2021.naacl-main.136},
	year = {2021},
	bdsk-url-1 = {https://aclanthology.org/2021.naacl-main.136},
	bdsk-url-2 = {https://doi.org/10.18653/v1/2021.naacl-main.136}}

@inproceedings{akbik-etal-2018-contextual,
	abstract = {Recent advances in language modeling using recurrent neural networks have made it viable to model language as distributions over characters. By learning to predict the next character on the basis of previous characters, such models have been shown to automatically internalize linguistic concepts such as words, sentences, subclauses and even sentiment. In this paper, we propose to leverage the internal states of a trained character language model to produce a novel type of word embedding which we refer to as contextual string embeddings. Our proposed embeddings have the distinct properties that they (a) are trained without any explicit notion of words and thus fundamentally model words as sequences of characters, and (b) are contextualized by their surrounding text, meaning that the same word will have different embeddings depending on its contextual use. We conduct a comparative evaluation against previous embeddings and find that our embeddings are highly useful for downstream tasks: across four classic sequence labeling tasks we consistently outperform the previous state-of-the-art. In particular, we significantly outperform previous work on English and German named entity recognition (NER), allowing us to report new state-of-the-art F1-scores on the CoNLL03 shared task. We release all code and pre-trained language models in a simple-to-use framework to the research community, to enable reproduction of these experiments and application of our proposed embeddings to other tasks: https://github.com/zalandoresearch/flair},
	address = {Santa Fe, New Mexico, USA},
	author = {Akbik, Alan and Blythe, Duncan and Vollgraf, Roland},
	booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
	date-added = {2021-09-01 13:10:59 -0400},
	date-modified = {2021-09-01 13:10:59 -0400},
	month = aug,
	pages = {1638--1649},
	publisher = {Association for Computational Linguistics},
	title = {Contextual String Embeddings for Sequence Labeling},
	url = {https://aclanthology.org/C18-1139},
	year = {2018},
	bdsk-url-1 = {https://aclanthology.org/C18-1139}}

@inproceedings{he-choi-2021-levi,
	abstract = {Coupled with biaffine decoders, transformers have been effectively adapted to text-to-graph transduction and achieved state-of-the-art performance on AMR parsing. Many prior works, however, rely on the biaffine decoder for either or both arc and label predictions although most features used by the decoder may be learned by the transformer already. This paper presents a novel approach to AMR parsing by combining heterogeneous data (tokens, concepts, labels) as one input to a transformer to learn attention, and use only attention matrices from the transformer to predict all elements in AMR graphs (concepts, arcs, labels). Although our models use significantly fewer parameters than the previous state-of-the-art graph parser, they show similar or better accuracy on AMR 2.0 and 3.0.},
	address = {Online},
	author = {He, Han and Choi, Jinho D.},
	booktitle = {Proceedings of the 17th International Conference on Parsing Technologies and the IWPT 2021 Shared Task on Parsing into Enhanced Universal Dependencies (IWPT 2021)},
	date-added = {2021-09-01 13:09:14 -0400},
	date-modified = {2021-09-01 13:09:14 -0400},
	doi = {10.18653/v1/2021.iwpt-1.5},
	month = aug,
	pages = {50--57},
	publisher = {Association for Computational Linguistics},
	title = {Levi Graph {AMR} Parser using Heterogeneous Attention},
	url = {https://aclanthology.org/2021.iwpt-1.5},
	year = {2021},
	bdsk-url-1 = {https://aclanthology.org/2021.iwpt-1.5},
	bdsk-url-2 = {https://doi.org/10.18653/v1/2021.iwpt-1.5}}

@inproceedings{conneau-etal-2020-unsupervised,
	abstract = {This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +14.6{\%} average accuracy on XNLI, +13{\%} average F1 score on MLQA, and +2.4{\%} F1 score on NER. XLM-R performs particularly well on low-resource languages, improving 15.7{\%} in XNLI accuracy for Swahili and 11.4{\%} for Urdu over previous XLM models. We also present a detailed empirical analysis of the key factors that are required to achieve these gains, including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We will make our code and models publicly available.},
	address = {Online},
	author = {Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
	booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
	date-added = {2021-09-01 12:41:50 -0400},
	date-modified = {2021-09-01 12:41:50 -0400},
	doi = {10.18653/v1/2020.acl-main.747},
	month = jul,
	pages = {8440--8451},
	publisher = {Association for Computational Linguistics},
	title = {Unsupervised Cross-lingual Representation Learning at Scale},
	url = {https://aclanthology.org/2020.acl-main.747},
	year = {2020},
	bdsk-url-1 = {https://aclanthology.org/2020.acl-main.747},
	bdsk-url-2 = {https://doi.org/10.18653/v1/2020.acl-main.747}}

@inproceedings{xue-etal-2021-mt5,
	abstract = {The recent {``}Text-to-Text Transfer Transformer{''} (T5) leveraged a unified text-to-text format and scale to attain state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual benchmarks. We also describe a simple technique to prevent {``}accidental translation{''} in the zero-shot setting, where a generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model checkpoints used in this work are publicly available.},
	address = {Online},
	author = {Xue, Linting and Constant, Noah and Roberts, Adam and Kale, Mihir and Al-Rfou, Rami and Siddhant, Aditya and Barua, Aditya and Raffel, Colin},
	booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
	date-added = {2021-09-01 12:40:34 -0400},
	date-modified = {2021-09-01 12:40:34 -0400},
	doi = {10.18653/v1/2021.naacl-main.41},
	month = jun,
	pages = {483--498},
	publisher = {Association for Computational Linguistics},
	title = {m{T}5: A Massively Multilingual Pre-trained Text-to-Text Transformer},
	url = {https://aclanthology.org/2021.naacl-main.41},
	year = {2021},
	bdsk-url-1 = {https://aclanthology.org/2021.naacl-main.41},
	bdsk-url-2 = {https://doi.org/10.18653/v1/2021.naacl-main.41}}

@misc{https://doi.org/10.35111/gvd0-xk91,
	author = {Xue, Nianwen and Zhang, Xiuhong and Jiang, Zixin and Palmer, Martha and Xia, Fei and Chiou, Fu-Dong and Chang, Meiyu},
	date-added = {2021-09-01 12:32:05 -0400},
	date-modified = {2021-09-01 12:36:22 -0400},
	doi = {10.35111/GVD0-XK91},
	publisher = {Linguistic Data Consortium},
	title = {Chinese Treebank 9.0},
	url = {https://catalog.ldc.upenn.edu/LDC2016T13},
	year = {2016},
	bdsk-url-1 = {https://catalog.ldc.upenn.edu/LDC2016T13},
	bdsk-url-2 = {https://doi.org/10.35111/GVD0-XK91}}

@inproceedings{clark2020electra,
	author = {Kevin Clark and Minh-Thang Luong and Quoc V. Le and Christopher D. Manning},
	booktitle = {ICLR},
	date-added = {2021-08-07 15:53:27 -0400},
	date-modified = {2021-08-07 15:53:27 -0400},
	title = {{ELECTRA}: Pre-training Text Encoders as Discriminators Rather Than Generators},
	url = {https://openreview.net/pdf?id=r1xMH1BtvB},
	year = {2020},
	bdsk-url-1 = {https://openreview.net/pdf?id=r1xMH1BtvB}}

@inproceedings{chang-etal-2009-discriminative,
	address = {Boulder, Colorado},
	author = {Chang, Pi-Chuan and Tseng, Huihsin and Jurafsky, Dan and Manning, Christopher D.},
	booktitle = {Proceedings of the Third Workshop on Syntax and Structure in Statistical Translation ({SSST}-3) at {NAACL} {HLT} 2009},
	date-added = {2021-03-17 13:37:03 -0400},
	date-modified = {2021-03-17 13:37:03 -0400},
	month = jun,
	pages = {51--59},
	publisher = {Association for Computational Linguistics},
	title = {Discriminative Reordering with {C}hinese Grammatical Relations Features},
	url = {https://www.aclweb.org/anthology/W09-2307},
	year = {2009},
	bdsk-url-1 = {https://www.aclweb.org/anthology/W09-2307}}

@inproceedings{pennington-etal-2014-glove,
	address = {Doha, Qatar},
	author = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher},
	booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
	date-added = {2020-12-31 15:07:29 -0500},
	date-modified = {2020-12-31 15:07:29 -0500},
	doi = {10.3115/v1/D14-1162},
	month = oct,
	pages = {1532--1543},
	publisher = {Association for Computational Linguistics},
	title = {{G}lo{V}e: Global Vectors for Word Representation},
	url = {https://www.aclweb.org/anthology/D14-1162},
	year = {2014},
	bdsk-url-1 = {https://www.aclweb.org/anthology/D14-1162},
	bdsk-url-2 = {https://doi.org/10.3115/v1/D14-1162}}

@incollection{he2018dual,
	author = {He, Han and Wu, Lei and Yang, Xiaokun and Yan, Hua and Gao, Zhimin and Feng, Yi and Townsend, George},
	booktitle = {Information Technology-New Generations},
	date-added = {2020-12-31 15:03:58 -0500},
	date-modified = {2020-12-31 15:03:58 -0500},
	pages = {421--426},
	publisher = {Springer},
	title = {Dual long short-term memory networks for sub-character representation learning},
	year = {2018}}

@inproceedings{devlin-etal-2019-bert,
	abstract = {We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5 (7.7 point absolute improvement), MultiNLI accuracy to 86.7{\%} (4.6{\%} absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).},
	address = {Minneapolis, Minnesota},
	author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
	booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
	date-added = {2020-12-31 14:46:54 -0500},
	date-modified = {2020-12-31 14:46:54 -0500},
	doi = {10.18653/v1/N19-1423},
	month = jun,
	pages = {4171--4186},
	publisher = {Association for Computational Linguistics},
	title = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
	url = {https://www.aclweb.org/anthology/N19-1423},
	year = {2019},
	bdsk-url-1 = {https://www.aclweb.org/anthology/N19-1423},
	bdsk-url-2 = {https://doi.org/10.18653/v1/N19-1423}}

@inproceedings{Lan2020ALBERT:,
	author = {Zhenzhong Lan and Mingda Chen and Sebastian Goodman and Kevin Gimpel and Piyush Sharma and Radu Soricut},
	booktitle = {International Conference on Learning Representations},
	date-added = {2020-12-31 14:44:52 -0500},
	date-modified = {2020-12-31 14:44:52 -0500},
	title = {{ALBERT}: A Lite {BERT} for Self-supervised Learning of Language Representations},
	url = {https://openreview.net/forum?id=H1eA7AEtvS},
	year = {2020},
	bdsk-url-1 = {https://openreview.net/forum?id=H1eA7AEtvS}}

@inproceedings{wang-xu-2017-convolutional,
	abstract = {Character-based sequence labeling framework is flexible and efficient for Chinese word segmentation (CWS). Recently, many character-based neural models have been applied to CWS. While they obtain good performance, they have two obvious weaknesses. The first is that they heavily rely on manually designed bigram feature, i.e. they are not good at capturing $n$-gram features automatically. The second is that they make no use of full word information. For the first weakness, we propose a convolutional neural model, which is able to capture rich $n$-gram features without any feature engineering. For the second one, we propose an effective approach to integrate the proposed model with word embeddings. We evaluate the model on two benchmark datasets: PKU and MSR. Without any feature engineering, the model obtains competitive performance {---} 95.7{\%} on PKU and 97.3{\%} on MSR. Armed with word embeddings, the model achieves state-of-the-art performance on both datasets {---} 96.5{\%} on PKU and 98.0{\%} on MSR, without using any external labeled resource.},
	address = {Taipei, Taiwan},
	author = {Wang, Chunqi and Xu, Bo},
	booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
	date-added = {2020-12-31 14:42:35 -0500},
	date-modified = {2020-12-31 14:42:35 -0500},
	month = nov,
	pages = {163--172},
	publisher = {Asian Federation of Natural Language Processing},
	title = {Convolutional Neural Network with Word Embeddings for {C}hinese Word Segmentation},
	url = {https://www.aclweb.org/anthology/I17-1017},
	year = {2017},
	bdsk-url-1 = {https://www.aclweb.org/anthology/I17-1017}}

@article{bojanowski2017enriching,
	author = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
	date-added = {2020-12-25 22:31:59 -0500},
	date-modified = {2020-12-25 22:31:59 -0500},
	issn = {2307-387X},
	journal = {Transactions of the Association for Computational Linguistics},
	pages = {135--146},
	title = {Enriching Word Vectors with Subword Information},
	volume = {5},
	year = {2017}}

@article{collins-koo-2005-discriminative,
	author = {Collins, Michael and Koo, Terry},
	date-added = {2020-12-25 17:25:59 -0500},
	date-modified = {2020-12-25 17:25:59 -0500},
	doi = {10.1162/0891201053630273},
	journal = {Computational Linguistics},
	number = {1},
	pages = {25--70},
	title = {Discriminative Reranking for Natural Language Parsing},
	url = {https://www.aclweb.org/anthology/J05-1003},
	volume = {31},
	year = {2005},
	bdsk-url-1 = {https://www.aclweb.org/anthology/J05-1003},
	bdsk-url-2 = {https://doi.org/10.1162/0891201053630273}}

@inproceedings{zhang-clark-2008-tale,
	address = {Honolulu, Hawaii},
	author = {Zhang, Yue and Clark, Stephen},
	booktitle = {Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing},
	date-added = {2020-12-25 15:10:10 -0500},
	date-modified = {2020-12-25 15:10:10 -0500},
	month = oct,
	pages = {562--571},
	publisher = {Association for Computational Linguistics},
	title = {A Tale of Two Parsers: {I}nvestigating and Combining Graph-based and Transition-based Dependency Parsing},
	url = {https://www.aclweb.org/anthology/D08-1059},
	year = {2008},
	bdsk-url-1 = {https://www.aclweb.org/anthology/D08-1059}}

@inproceedings{pradhan-etal-2012-conll,
	address = {Jeju Island, Korea},
	author = {Pradhan, Sameer and Moschitti, Alessandro and Xue, Nianwen and Uryupina, Olga and Zhang, Yuchen},
	booktitle = {Joint Conference on {EMNLP} and {C}o{NLL} - Shared Task},
	date-added = {2020-12-24 23:42:41 -0500},
	date-modified = {2020-12-24 23:42:41 -0500},
	month = jul,
	pages = {1--40},
	publisher = {Association for Computational Linguistics},
	title = {{C}o{NLL}-2012 Shared Task: Modeling Multilingual Unrestricted Coreference in {O}nto{N}otes},
	url = {https://www.aclweb.org/anthology/W12-4501},
	year = {2012},
	bdsk-url-1 = {https://www.aclweb.org/anthology/W12-4501}}

@inproceedings{levow-2006-third,
	address = {Sydney, Australia},
	author = {Levow, Gina-Anne},
	booktitle = {Proceedings of the Fifth {SIGHAN} Workshop on {C}hinese Language Processing},
	date-added = {2020-12-24 23:21:14 -0500},
	date-modified = {2020-12-24 23:21:14 -0500},
	month = jul,
	pages = {108--117},
	publisher = {Association for Computational Linguistics},
	title = {The Third International {C}hinese Language Processing Bakeoff: Word Segmentation and Named Entity Recognition},
	url = {https://www.aclweb.org/anthology/W06-0115},
	year = {2006},
	bdsk-url-1 = {https://www.aclweb.org/anthology/W06-0115}}

@inproceedings{tjong-kim-sang-de-meulder-2003-introduction,
	author = {Tjong Kim Sang, Erik F. and De Meulder, Fien},
	booktitle = {Proceedings of the Seventh Conference on Natural Language Learning at {HLT}-{NAACL} 2003},
	date-added = {2020-12-24 23:19:00 -0500},
	date-modified = {2020-12-24 23:19:00 -0500},
	pages = {142--147},
	title = {Introduction to the {C}o{NLL}-2003 Shared Task: Language-Independent Named Entity Recognition},
	url = {https://www.aclweb.org/anthology/W03-0419},
	year = {2003},
	bdsk-url-1 = {https://www.aclweb.org/anthology/W03-0419}}

@inproceedings{koehn2005europarl,
	author = {Koehn, Philipp},
	booktitle = {MT summit},
	date-added = {2020-12-24 23:06:03 -0500},
	date-modified = {2020-12-24 23:06:03 -0500},
	organization = {Citeseer},
	pages = {79--86},
	title = {Europarl: A parallel corpus for statistical machine translation},
	volume = {5},
	year = {2005}}

@inproceedings{Schweter:Ahmed:2019,
	author = {Stefan Schweter and Sajawel Ahmed},
	booktitle = {Proceedings of the 15th Conference on Natural Language Processing (KONVENS)},
	date-added = {2020-12-24 23:03:23 -0500},
	date-modified = {2020-12-24 23:03:23 -0500},
	location = {Erlangen, Germany},
	note = {accepted},
	title = {{Deep-EOS: General-Purpose Neural Networks for Sentence Boundary Detection}},
	year = 2019}

@incollection{he2019effective,
	author = {He, Han and Wu, Lei and Yan, Hua and Gao, Zhimin and Feng, Yi and Townsend, George},
	booktitle = {Smart Intelligent Computing and Applications},
	date-added = {2020-12-24 19:35:03 -0500},
	date-modified = {2020-12-24 19:35:03 -0500},
	pages = {133--142},
	publisher = {Springer},
	title = {Effective neural solution for multi-criteria word segmentation},
	year = {2019}}

@inproceedings{dozat2017stanford,
	author = {Dozat, Timothy and Qi, Peng and Manning, Christopher D},
	booktitle = {Proceedings of the CoNLL 2017 Shared Task: Multilingual Parsing from Raw Text to Universal Dependencies},
	date-added = {2020-12-24 15:02:18 -0500},
	date-modified = {2020-12-24 15:02:18 -0500},
	pages = {20--30},
	title = {Stanford's graph-based neural dependency parser at the conll 2017 shared task},
	year = {2017}}

@inproceedings{he-etal-2018-jointly,
	abstract = {Recent BIO-tagging-based neural semantic role labeling models are very high performing, but assume gold predicates as part of the input and cannot incorporate span-level features. We propose an end-to-end approach for jointly predicting all predicates, arguments spans, and the relations between them. The model makes independent decisions about what relationship, if any, holds between every possible word-span pair, and learns contextualized span representations that provide rich, shared input features for each decision. Experiments demonstrate that this approach sets a new state of the art on PropBank SRL without gold predicates.},
	address = {Melbourne, Australia},
	author = {He, Luheng and Lee, Kenton and Levy, Omer and Zettlemoyer, Luke},
	booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
	date-added = {2020-12-24 14:23:45 -0500},
	date-modified = {2020-12-24 14:23:45 -0500},
	doi = {10.18653/v1/P18-2058},
	month = jul,
	pages = {364--369},
	publisher = {Association for Computational Linguistics},
	title = {Jointly Predicting Predicates and Arguments in Neural Semantic Role Labeling},
	url = {https://www.aclweb.org/anthology/P18-2058},
	year = {2018},
	bdsk-url-1 = {https://www.aclweb.org/anthology/P18-2058},
	bdsk-url-2 = {https://doi.org/10.18653/v1/P18-2058}}

@inproceedings{yu-etal-2020-named,
	abstract = {Named Entity Recognition (NER) is a fundamental task in Natural Language Processing, concerned with identifying spans of text expressing references to entities. NER research is often focused on flat entities only (flat NER), ignoring the fact that entity references can be nested, as in [Bank of [China]] (Finkel and Manning, 2009). In this paper, we use ideas from graph-based dependency parsing to provide our model a global view on the input via a biaffine model (Dozat and Manning, 2017). The biaffine model scores pairs of start and end tokens in a sentence which we use to explore all spans, so that the model is able to predict named entities accurately. We show that the model works well for both nested and flat NER through evaluation on 8 corpora and achieving SoTA performance on all of them, with accuracy gains of up to 2.2 percentage points.},
	address = {Online},
	author = {Yu, Juntao and Bohnet, Bernd and Poesio, Massimo},
	booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
	date-added = {2020-12-24 13:35:09 -0500},
	date-modified = {2020-12-24 13:35:09 -0500},
	doi = {10.18653/v1/2020.acl-main.577},
	month = jul,
	pages = {6470--6476},
	publisher = {Association for Computational Linguistics},
	title = {Named Entity Recognition as Dependency Parsing},
	url = {https://www.aclweb.org/anthology/2020.acl-main.577},
	year = {2020},
	bdsk-url-1 = {https://www.aclweb.org/anthology/2020.acl-main.577},
	bdsk-url-2 = {https://doi.org/10.18653/v1/2020.acl-main.577}}

@inproceedings{10.1145/1457838.1457895,
	abstract = {Many computer applications require the storage of large amounts of information within the computer's memory where it will be readily available for reference and updating. Quite commonly, more storage space is required than is available in the computer's high-speed working memory. It is, therefore, a common practice to equip computers with magnetic tapes, disks, or drums, or a combination of these to provide additional storage. This additional storage is always slower in operation than the computer's working memory and therefore care must be taken when using it to avoid excessive operating time.},
	address = {New York, NY, USA},
	author = {De La Briandais, Rene},
	booktitle = {Papers Presented at the March 3-5, 1959, Western Joint Computer Conference},
	date-added = {2020-12-24 13:07:31 -0500},
	date-modified = {2020-12-24 13:07:31 -0500},
	doi = {10.1145/1457838.1457895},
	isbn = {9781450378659},
	location = {San Francisco, California},
	numpages = {4},
	pages = {295--298},
	publisher = {Association for Computing Machinery},
	series = {IRE-AIEE-ACM '59 (Western)},
	title = {File Searching Using Variable Length Keys},
	url = {https://doi.org/10.1145/1457838.1457895},
	year = {1959},
	bdsk-url-1 = {https://doi.org/10.1145/1457838.1457895}}

@article{lafferty2001conditional,
	author = {Lafferty, John and McCallum, Andrew and Pereira, Fernando CN},
	date-added = {2020-12-24 11:46:30 -0500},
	date-modified = {2020-12-24 12:08:29 -0500},
	journal = {Departmental Papers (CIS)},
	title = {Conditional random fields: Probabilistic models for segmenting and labeling sequence data},
	year = {2001}}

@inproceedings{clark-etal-2019-bam,
	abstract = {It can be challenging to train multi-task neural networks that outperform or even match their single-task counterparts. To help address this, we propose using knowledge distillation where single-task models teach a multi-task model. We enhance this training with teacher annealing, a novel method that gradually transitions the model from distillation to supervised learning, helping the multi-task model surpass its single-task teachers. We evaluate our approach by multi-task fine-tuning BERT on the GLUE benchmark. Our method consistently improves over standard single-task and multi-task training.},
	address = {Florence, Italy},
	author = {Clark, Kevin and Luong, Minh-Thang and Khandelwal, Urvashi and Manning, Christopher D. and Le, Quoc V.},
	booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
	date-added = {2020-12-24 11:26:54 -0500},
	date-modified = {2020-12-24 11:26:54 -0500},
	doi = {10.18653/v1/P19-1595},
	month = jul,
	pages = {5931--5937},
	publisher = {Association for Computational Linguistics},
	title = {{BAM}! Born-Again Multi-Task Networks for Natural Language Understanding},
	url = {https://www.aclweb.org/anthology/P19-1595},
	year = {2019},
	bdsk-url-1 = {https://www.aclweb.org/anthology/P19-1595},
	bdsk-url-2 = {https://doi.org/10.18653/v1/P19-1595}}

@inproceedings{kondratyuk-straka-2019-75,
	address = {Hong Kong, China},
	author = {Kondratyuk, Dan and Straka, Milan},
	booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
	date-added = {2020-12-23 23:51:07 -0500},
	date-modified = {2020-12-23 23:51:07 -0500},
	pages = {2779--2795},
	publisher = {Association for Computational Linguistics},
	title = {75 Languages, 1 Model: Parsing Universal Dependencies Universally},
	url = {https://www.aclweb.org/anthology/D19-1279},
	year = {2019},
	bdsk-url-1 = {https://www.aclweb.org/anthology/D19-1279}}

@inproceedings{dozat:17a,
	author = {Dozat, Timothy and Manning, Christopher D.},
	booktitle = {Proceedings of the 5th International Conference on Learning Representations},
	date-added = {2020-12-23 23:46:20 -0500},
	date-modified = {2020-12-23 23:46:20 -0500},
	series = {ICLR'17},
	title = {{Deep Biaffine Attention for Neural Dependency Parsing}},
	url = {https://openreview.net/pdf?id=Hk95PK9le},
	year = {2017},
	bdsk-url-1 = {http://arxiv.org/abs/1611.01734},
	bdsk-url-2 = {https://openreview.net/pdf?id=Hk95PK9le}}

@inproceedings{smith-smith-2007-probabilistic,
	address = {Prague, Czech Republic},
	author = {Smith, David A. and Smith, Noah A.},
	booktitle = {Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning ({EMNLP}-{C}o{NLL})},
	date-added = {2020-12-23 21:46:06 -0500},
	date-modified = {2020-12-23 21:46:06 -0500},
	month = jun,
	pages = {132--140},
	publisher = {Association for Computational Linguistics},
	title = {Probabilistic Models of Nonprojective Dependency Trees},
	url = {https://www.aclweb.org/anthology/D07-1014},
	year = {2007},
	bdsk-url-1 = {https://www.aclweb.org/anthology/D07-1014}}

@inproceedings{ijcai2020-560,
	author = {Zhang, Yu and Zhou, Houquan and Li, Zhenghua},
	booktitle = {Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence, {IJCAI-20}},
	date-added = {2020-12-23 21:36:56 -0500},
	date-modified = {2020-12-23 21:36:56 -0500},
	doi = {10.24963/ijcai.2020/560},
	editor = {Christian Bessiere},
	month = {7},
	note = {Main track},
	pages = {4046--4053},
	publisher = {International Joint Conferences on Artificial Intelligence Organization},
	title = {Fast and Accurate Neural CRF Constituency Parsing},
	url = {https://doi.org/10.24963/ijcai.2020/560},
	year = {2020},
	bdsk-url-1 = {https://doi.org/10.24963/ijcai.2020/560}}

@inproceedings{buchholz-marsi-2006-conll,
	address = {New York City},
	author = {Buchholz, Sabine and Marsi, Erwin},
	booktitle = {Proceedings of the Tenth Conference on Computational Natural Language Learning ({C}o{NLL}-X)},
	date-added = {2020-12-22 22:57:41 -0500},
	date-modified = {2020-12-22 22:57:41 -0500},
	month = jun,
	pages = {149--164},
	publisher = {Association for Computational Linguistics},
	title = {{C}o{NLL}-{X} Shared Task on Multilingual Dependency Parsing},
	url = {https://www.aclweb.org/anthology/W06-2920},
	year = {2006},
	bdsk-url-1 = {https://www.aclweb.org/anthology/W06-2920}}


================================================
FILE: docs/references.rst
================================================
References
==================

.. bibliography:: references.bib
	:cited:
	:style: astrostyle

================================================
FILE: docs/tutorial.md
================================================
---
jupytext:
  formats: ipynb,md:myst
  text_representation:
    extension: .md
    format_name: myst
    format_version: '0.8'
    jupytext_version: 1.4.2
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---

# Tutorial

Natural Language Processing is an exciting field consisting of many closely related tasks like lexical analysis 
and parsing. Each task involves many datasets and models, all requiring a high degree of expertise. 
Things become even more complex when dealing with multilingual text, as there are simply no datasets for some 
low-resource languages. However, with HanLP 2.1, core NLP tasks have been made easy to access and efficient in 
production environments. In this tutorial, we'll walk through the APIs in HanLP step by step. 

HanLP offers an out-of-the-box RESTful API and a native Python API, which share very similar interfaces 
although they are designed for different scenarios.

```{code-cell} ipython3
:tags: [remove_cell]

import hanlp_common.constant

hanlp_common.constant.IPYTHON = False  # Avoid pretty_print prints html which doesn't play well with this theme
```

## RESTful API

The RESTful API is an endpoint to which you send your documents and from which you get the parsed annotations back. 
We are hosting a **non-commercial** API service and you are welcome to [apply for an auth key](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178). 
An auth key is a password which gives you access to our API and protects our server from being abused. 
Once you have obtained an auth key, you can parse your documents with our RESTful client, which can be installed via:

````{margin} **Non-Commercial**
```{seealso}
Our models and RESTful APIs are under the [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) licence.
```
````

````{margin} **Zero-Shot Learning**
```{note}
Although UD covers 104 languages, OntoNotes (NER, CON, SRL) covers only English, Chinese and Arabic.
So NER/CON/SRL for languages other than these three are considered zero-shot and their accuracies can be very low.  
```
````

```bash
pip install hanlp_restful
```

```{eval-rst}
Then initiate a :class:`~hanlp_restful.HanLPClient` with your auth key and send a document to have it parsed.
```

```{code-cell} ipython3
:tags: [output_scroll]
from hanlp_restful import HanLPClient
# Fill in your auth, set language='zh' to use Chinese models
HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='mul')
doc = HanLP('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments. ' \
            '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。' \
            '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。')
print(doc)
```
````{margin} **But what do these annotations mean?**
```{seealso}
See our [data format](data_format) and [annotations](annotations/index) for details.
```
````


## Visualization

```{eval-rst}
The returned :class:`~hanlp_common.document.Document` has a handy method :meth:`~hanlp_common.document.Document.pretty_print` 
which offers visualization in any mono-width text environment. 
```

````{margin} **Non-ASCII**
```{note}
Non-ASCII text might be skewed in terminals but in Jupyter Notebook it will align correctly. 
You can also use our [live demo](https://hanlp.hankcs.com/).
```
````

````{margin} **Non-Projective**
```{note}
Non-projective dependency trees cannot be visualized and won't be printed out at this moment.
```
````

```{code-cell} ipython3
doc.pretty_print()
```

## Native API

### Multi-Task Learning

If you want to run our models locally or you want to implement your own RESTful server, 
you can [install the native API](https://hanlp.hankcs.com/docs/install.html#install-native-package) 
and call it just like the RESTful one.

````{margin} **Sentences Required**
```{seealso}
As MTL doesn't predict sentence boundaries, inputs have to be split beforehand. 
See our [data format](data_format) for details.
```
````

```{code-cell} ipython3
:tags: [output_scroll]
import hanlp
HanLP = hanlp.load(hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE)
print(HanLP(['In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.',
             '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
             '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。']))
```

Because the service provider is very likely running a different model or using different settings, the
RESTful and native results might be slightly different. 

To process Chinese or Japanese, HanLP provides mono-lingual models in each language which significantly outperform the multi-lingual model. See [docs](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/mtl.html) for the list of models.

### Single-Task Learning

HanLP also provides a full spectrum of single-task learning models for core NLP tasks including tagging and parsing. Please refer to the documentation of the [`pretrained`](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/index.html) models for details.

================================================
FILE: hanlp/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-06-13 18:05
import hanlp.common
import hanlp.components
import hanlp.pretrained
import hanlp.utils
from hanlp.version import __version__

hanlp.utils.ls_resource_in_module(hanlp.pretrained)


def load(save_dir: str, verbose=None, **kwargs) -> hanlp.common.component.Component:
    """Load a pretrained component given its identifier.

    Args:
      save_dir (str): The identifier of the saved component. Identifiers registered in ``hanlp.pretrained.ALL``
        are resolved through that table; anything else (e.g., a remote URL or a local path) is used as is.
      verbose: ``True`` to print loading progress. When ``None``, the value of
        ``hanlp_common.constant.HANLP_VERBOSE`` is used.
      **kwargs: Extra arguments forwarded to the loader, e.g., ``devices`` is a useful argument to specify
        which GPU devices a PyTorch component will use
        (see :func:`hanlp.common.torch_component.TorchComponent.load`).

    Examples::

        import hanlp
        # Load component onto the 0-th GPU.
        hanlp.load(..., devices=0)
        # Load component onto the 0-th and 1-st GPUs using data parallelization.
        hanlp.load(..., devices=[0, 1])

    .. Note::
        A component can have dependencies on other components or resources, which will be recursively loaded. So it's
        common to see multiple downloading messages per single load.

    Returns:
      hanlp.common.component.Component: A pretrained component.

    """
    identifier = hanlp.pretrained.ALL.get(save_dir, save_dir)
    from hanlp.utils.component_util import load_from_meta_file
    if verbose is not None:
        return load_from_meta_file(identifier, 'meta.json', verbose=verbose, **kwargs)
    # No explicit preference given: fall back to the package-wide verbosity setting.
    from hanlp_common.constant import HANLP_VERBOSE
    return load_from_meta_file(identifier, 'meta.json', verbose=HANLP_VERBOSE, **kwargs)


def pipeline(*pipes) -> hanlp.components.pipeline.Pipeline:
    """Bundle components into a pipeline.

    It's made for bundling `KerasComponents`. For `TorchComponent`, use
    :class:`~hanlp.components.mtl.multi_task_learning.MultiTaskLearning` instead.

    Args:
      *pipes: Pre-defined components, if any, in the order they should run.

    Returns:
      hanlp.components.pipeline.Pipeline: A pipeline, which is a list of components in order.

    """
    pipeline_cls = hanlp.components.pipeline.Pipeline
    return pipeline_cls(*pipes)


================================================
FILE: hanlp/callbacks/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-05 02:10

================================================
FILE: hanlp/callbacks/fine_csv_logger.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-05 02:12
import copy
from io import TextIOWrapper
from typing import List

import numpy as np
import tensorflow as tf


class StreamTableFormatter(object):
    """Right-aligns cells into columns whose widths grow as rows are streamed in.

    Column widths are remembered across calls, so later rows are padded to at least the width of the
    widest cell seen so far in the same column.
    """

    def __init__(self) -> None:
        super().__init__()
        # Widest rendering seen so far for every column; ``None`` until the first row arrives.
        self.col_widths = None

    def format_row(self, cells) -> List[str]:
        """Format one row and widen the remembered column widths if needed.

        Args:
            cells: An iterable of cell values. The number of columns is fixed by the first row.

        Returns:
            The formatted cells, each right-aligned to the current width of its column.
        """
        if not isinstance(cells, list):
            cells = list(cells)
        if not self.col_widths:
            self.col_widths = [0] * len(cells)
        for i, c in enumerate(cells):
            self.col_widths[i] = max(self.col_widths[i], len(self.format_cell(c, self.col_widths[i])))
        return [self.format_cell(cell, width) for cell, width in zip(cells, self.col_widths)]

    def format_cell(self, cell: str, min_width) -> str:
        """Render a single cell right-aligned to ``min_width`` characters.

        Args:
            cell: The value to render. Floats are printed with 4 decimal places.
            min_width: The minimum width of the rendered cell.

        Returns:
            The rendered cell.
        """
        # ``np.float`` was only an alias of the builtin ``float`` and no longer exists in recent NumPy
        # releases, so the builtin is used directly (``np.float64`` is a subclass of ``float``).
        if isinstance(cell, (np.float32, float)):
            return '{:>{}.4f}'.format(cell, min_width)
        return '{:>{}}'.format(cell, min_width)


class FineCSVLogger(tf.keras.callbacks.History):
    """A ``History`` callback which additionally streams the logs of every epoch into a text table.

    The file gets a header row, a row of dashes (markdown style) and then one row per finished epoch.
    Cells are right-aligned by a :class:`StreamTableFormatter` and joined with ``separator``.
    """

    def __init__(self, filename, separator=',', append=False):
        """
        Args:
            filename: Path of the file to write to.
            separator: The string placed between the cells of a row.
            append: ``True`` to append to ``filename`` instead of overwriting it.
        """
        super().__init__()
        self.filename = filename
        self.separator = separator
        self.append = append
        self.formatter = StreamTableFormatter()
        # Sorted names of the logged metrics, decided at the end of the first epoch.
        self.keys = []
        # The output stream, opened when training begins.
        self.out: TextIOWrapper = None

    def _write_row(self, cells):
        # Join the already formatted cells and terminate the line.
        self.out.write(self.separator.join(cells) + '\n')

    def on_train_begin(self, logs=None):
        super().on_train_begin(logs)
        mode = 'a' if self.append else 'w'
        self.out = open(self.filename, mode)

    def on_train_end(self, logs=None):
        self.out.close()

    def on_epoch_end(self, epoch, logs=None):
        super().on_epoch_end(epoch, logs)
        row_number = epoch + 1
        if not self.keys:
            # First epoch: fix the column order, then emit the header and the bar row.
            self.keys = sorted(logs.keys())

            if getattr(self.model, 'stop_training', None):
                # We set NA so that csv parsers do not fail for this last epoch.
                logs = {k: (logs[k] if k in logs else 'NA') for k in self.keys}

            # Format the values before the headers so that the column widths already account for both;
            # the values themselves are formatted again below.
            self.formatter.format_row([row_number] + [logs.get(k, 'NA') for k in self.keys])
            header_cells = self.formatter.format_row(['epoch'] + self.keys)
            self._write_row(header_cells)
            # bars for markdown style
            self._write_row(['-' * width for width in self.formatter.col_widths])

        value_cells = self.formatter.format_row([row_number] + [logs.get(k, 'NA') for k in self.keys])
        self._write_row(value_cells)
        self.out.flush()


================================================
FILE: hanlp/common/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-26 14:45


================================================
FILE: hanlp/common/component.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-26 14:45
import inspect
from abc import ABC, abstractmethod
from typing import Any

from hanlp_common.configurable import Configurable


class Component(Configurable, ABC):
    """The base class for all components, including rule based and statistical ones."""

    @abstractmethod
    def predict(self, *args, **kwargs):
        """Predict on data. Sub-classes have to override this method.

        Args:
          *args: Any type of data subject to sub-classes
          **kwargs: Additional arguments

        Returns: Any predicted annotations.

        """
        # The innermost frame on the stack is this very method, so its name is reported in the error.
        method_name = inspect.stack()[0].function
        owner = self.__class__.__name__
        raise NotImplementedError('%s.%s()' % (owner, method_name))

    def __call__(self, *args, **kwargs):
        """
        A shortcut for :meth:`~hanlp.common.component.Component.predict`.

        Args:
          *args: Any type of data subject to sub-classes
          **kwargs: Additional arguments

        Returns: Any predicted annotations.

        """
        return self.predict(*args, **kwargs)


================================================
FILE: hanlp/common/dataset.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-09 20:27
import math
import os
import random
import tempfile
import warnings
from abc import ABC, abstractmethod
from copy import copy
from logging import Logger
from typing import Union, List, Callable, Iterable, Dict, Any

import torch
import torch.multiprocessing as mp
from hanlp.common.transform import TransformList, VocabDict, EmbeddingNamedTransform
from hanlp.common.vocab import Vocab
from hanlp.components.parsers.alg import kmeans
from hanlp.utils.io_util import read_cells, get_resource
from hanlp.utils.time_util import CountdownTimer
from hanlp.utils.torch_util import dtype_of
from hanlp_common.configurable import AutoConfigurable
from hanlp_common.constant import IDX, HANLP_VERBOSE
from hanlp_common.util import isdebugging, merge_list_of_dict, k_fold
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, Sampler
from torch.utils.data.dataset import IterableDataset


class Transformable(ABC):
    def __init__(self, transform: Union[Callable, List] = None) -> None:
        """An object whose samples are passed through a chain of transform functions.

        The functions themselves are kept on the object so that they can be extended later.

        Args:
            transform: A transform function or a list of functions. A plain ``list`` is wrapped into a
                ``TransformList``.
        """
        super().__init__()
        is_plain_list = isinstance(transform, list) and not isinstance(transform, TransformList)
        self.transform: Union[Callable, TransformList] = TransformList(*transform) if is_plain_list else transform

    def append_transform(self, transform: Callable):
        """Add a transform to the end of the chain unless it is already part of it.

        Args:
            transform: A new transform to be appended.

        Returns:
            Itself.

        """
        assert transform is not None, 'None transform not allowed'
        current = self.transform
        if not current:
            # Nothing registered yet: start a fresh chain.
            self.transform = TransformList(transform)
            return self
        if isinstance(current, TransformList):
            if transform not in current:
                current.append(transform)
            return self
        # A single bare callable is registered: promote it to a chain holding both.
        if current != transform:
            self.transform = TransformList(current, transform)
        return self

    def insert_transform(self, index: int, transform: Callable):
        """Put a transform at a given position of the chain unless it is already part of it.

        Args:
            index: The position to insert at.
            transform: A new transform.

        Returns:
            Itself.

        """
        assert transform is not None, 'None transform not allowed'
        current = self.transform
        if not current:
            # Nothing registered yet: the position is irrelevant.
            self.transform = TransformList(transform)
        elif isinstance(current, TransformList):
            if transform not in current:
                current.insert(index, transform)
        elif current != transform:
            # A single bare callable is registered: wrap it first, then insert next to it.
            self.transform = chain = TransformList(current)
            chain.insert(index, transform)
        return self

    def transform_sample(self, sample: dict, inplace=False) -> dict:
        """Run a sample through the registered transforms.

        Args:
            sample: A sample, which is a ``dict`` holding features.
            inplace: ``True`` to hand the sample itself to the transforms; otherwise they receive a
                shallow copy of it.

        .. Attention::
            If any transform modifies existing features, it will modify again and again when ``inplace=True``.
            For example, if a transform insert a ``BOS`` token to a list inplace, and it is called twice,
            then 2 ``BOS`` will be inserted which might not be an intended result.

        Returns:
            Transformed sample.
        """
        result = sample if inplace else copy(sample)
        if self.transform:
            result = self.transform(result)
        return result


class TransformableDataset(Transformable, Dataset, ABC):

    def __init__(self,
                 data: Union[str, List],
                 transform: Union[Callable, List] = None,
                 cache=None,
                 generate_idx=None) -> None:
        """A :class:`~torch.utils.data.Dataset` which can be applied with a list of transform functions.

        Args:
            data: The local or remote path to a dataset, or a list of samples where each sample is a dict.
            transform: Predefined transform(s).
            cache: ``True`` to enable caching, so that transforms won't be called twice.
            generate_idx: Create a :const:`~hanlp_common.constant.IDX` field for each sample to store its order in
                dataset. Useful for prediction when samples are re-ordered by a sampler. When ``None``, it is
                enabled if and only if ``data`` is a list.
        """
        super().__init__(transform)
        if generate_idx is None:
            generate_idx = isinstance(data, list)
        data_ = self.load_data(data, generate_idx)
        if data_:
            assert isinstance(data_[0], dict
                              ), f'TransformDataset expects each sample to be a dict but got {type(data_[0])} instead.'
        self.data = data_
        # One slot per sample; ``None`` marks a sample which has not been transformed yet.
        self.cache = [None] * len(data_) if cache else None

    def load_data(self, data, generate_idx=False):
        """An intermediate step between constructor and calling the actual file loading method.

        Args:
            data: If data is a file, this method calls :meth:`~hanlp.common.dataset.TransformableDataset.load_file`
                to load it. Otherwise it is used as the samples directly.
            generate_idx: Create a :const:`~hanlp_common.constant.IDX` field for each sample to store its order in
                dataset. Useful for prediction when samples are re-ordered by a sampler. The field is written
                into the sample dicts themselves.

        Returns: Loaded samples.

        """
        if self.should_load_file(data):
            if isinstance(data, str):
                data = get_resource(data)
            data = list(self.load_file(data))
        if generate_idx:
            for i, each in enumerate(data):
                each[IDX] = i
        return data

    def should_load_file(self, data) -> bool:
        """Determines whether data is a filepath.

        Args:
            data: Data to check.

        Returns: ``True`` to indicate it's a filepath.

        """
        return isinstance(data, str)

    @abstractmethod
    def load_file(self, filepath: str):
        """The actual file loading logic.

        Args:
            filepath: The path to a dataset.
        """
        pass

    def __getitem__(self, index: Union[int, slice]) -> Union[dict, List[dict]]:
        """ Get the index-th sample in this dataset.

        Args:
            index: Either an integer index or a slice.

        Returns: Either a transformed sample or a list of transformed samples when a slice is passed in.

        """
        if isinstance(index, slice):
            indices = range(*index.indices(len(self)))
            return [self[i] for i in indices]

        if self.cache:
            cached = self.cache[index]
            # ``None`` is the only marker of an empty slot, so that a falsy sample is still served from cache.
            if cached is not None:
                return cached
        sample = self.data[index]
        sample = self.transform_sample(sample)
        if self.cache:
            self.cache[index] = sample
        return sample

    def __len__(self) -> int:
        return len(self.data)

    def __repr__(self) -> str:
        return f'{len(self)} samples: {self[0] if len(self) else ""} ...'

    def purge_cache(self):
        """Purges all cache. If cache is not enabled, this method enables it.
        """
        self.cache = [None] * len(self.data)

    def split(self, *ratios):
        """Split dataset into subsets.

        Args:
            *ratios: The ratios for each subset. They can be any type of numbers which will be normalized. For example,
                    ``8, 1, 1`` are equivalent to ``0.8, 0.1, 0.1``.

        Returns:
            list[TransformableDataset]: A list of subsets. Each of them is a shallow copy of this dataset holding
            a contiguous chunk of its samples.
        """
        ratios = [x / sum(ratios) for x in ratios]
        chunks = []
        prev = 0
        for r in ratios:
            cur = prev + math.ceil(len(self) * r)
            chunks.append([prev, cur])
            prev = cur
        # Rounding up may overshoot or undershoot; the last chunk always ends at the last sample.
        chunks[-1][1] = len(self)
        outputs = []
        for b, e in chunks:
            dataset = copy(self)
            dataset.data = dataset.data[b:e]
            if dataset.cache:
                dataset.cache = dataset.cache[b:e]
            outputs.append(dataset)
        return outputs

    def k_fold(self, k, i):
        """Perform k-fold sampling.

        Args:
            k (int): Number of folds.
            i (int): The i-th fold.

        Returns:
            tuple[TransformableDataset, TransformableDataset]: The training subset and the test subset of the
            i-th fold.

        """
        # NOTE(review): ``i == k`` passes this check although folds are presumably numbered 0..k-1;
        # confirm against ``hanlp_common.util.k_fold`` before tightening it.
        assert 0 <= i <= k, f'Invalid split {i}'
        train_indices, test_indices = k_fold(k, len(self), i)
        return self.subset(train_indices), self.subset(test_indices)

    def subset(self, indices):
        """Create a subset given indices of samples.

        Args:
            indices: Indices of samples.

        Returns:
            TransformableDataset: A subset of this dataset, which is a shallow copy holding the selected samples.
        """
        dataset = copy(self)
        dataset.data = [dataset.data[i] for i in indices]
        if dataset.cache:
            dataset.cache = [dataset.cache[i] for i in indices]
        return dataset

    def shuffle(self):
        """Shuffle this dataset inplace.
        """
        if not self.cache:
            random.shuffle(self.data)
        else:
            # Shuffle samples together with their cache slots so that they stay aligned.
            z = list(zip(self.data, self.cache))
            random.shuffle(z)
            # Rebuild lists rather than tuples: cache slots are assigned to in ``__getitem__`` and the data
            # may be shuffled in place again later.
            self.data = [sample for sample, _ in z]
            self.cache = [cached for _, cached in z]

    def prune(self, criterion: Callable, logger: Logger = None):
        """Prune (to discard) samples according to a criterion.

        Args:
            criterion: A function which takes a sample as input and outputs ``True`` if the sample needs to be
                pruned. It receives samples as returned by iterating this dataset.
            logger: If any, log statistical messages using it.

        Returns:
            int: Size before pruning.
        """
        # noinspection PyTypeChecker
        size_before = len(self)
        good_ones = [i for i, s in enumerate(self) if not criterion(s)]
        self.data = [self.data[i] for i in good_ones]
        if self.cache:
            self.cache = [self.cache[i] for i in good_ones]
        if logger:
            size_after = len(self)
            num_pruned = size_before - size_after
            # An empty dataset has nothing to prune; avoid dividing by zero when reporting the ratio.
            ratio = num_pruned / size_before if size_before else 0.0
            logger.info(f'Pruned [yellow]{num_pruned} ({ratio:.1%})[/yellow] '
                        f'samples out of {size_before}.')
        return size_before


class TransformSequentialDataset(Transformable, IterableDataset, ABC):
    """Abstract base combining ``Transformable`` with an iterable-style ``IterableDataset``.

    It adds no members of its own.
    """
    pass


class DeviceDataLoader(DataLoader):
    """A ``DataLoader`` which moves every tensor field of each batch onto ``device``.

    Batches are expected to expose ``items()`` (e.g. a ``dict``) whenever ``device`` is set.
    """

    def __init__(self, dataset, batch_size=32, shuffle=False, sampler=None,
                 batch_sampler=None, num_workers=None, collate_fn=None,
                 pin_memory=False, drop_last=False, timeout=0,
                 worker_init_fn=None, multiprocessing_context=None,
                 device=None, **kwargs):
        """Create the loader.

        Args:
            dataset: The dataset to load from.
            batch_size: Samples per batch; reset to ``1`` when ``batch_sampler`` is given.
            shuffle: Forwarded to ``DataLoader``.
            sampler: Forwarded to ``DataLoader``.
            batch_sampler: Forwarded to ``DataLoader``.
            num_workers: Number of workers. ``None`` picks ``0`` while debugging and ``2`` otherwise.
            collate_fn: Forwarded to ``DataLoader`` as is.
            pin_memory: Forwarded to ``DataLoader``.
            drop_last: Forwarded to ``DataLoader``.
            timeout: Forwarded to ``DataLoader``.
            worker_init_fn: Forwarded to ``DataLoader``.
            multiprocessing_context: Forwarded to ``DataLoader``.
            device: Where tensors of each batch are moved to; ``None`` leaves batches untouched.
            **kwargs: Extra arguments forwarded to ``DataLoader``.
        """
        # an explicit batch sampler is mutually exclusive with a batch size other than 1
        batch_size = batch_size if batch_sampler is None else 1
        if num_workers is None:
            num_workers = 0 if isdebugging() else 2
        # noinspection PyArgumentList
        super().__init__(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            sampler=sampler,
            batch_sampler=batch_sampler,
            num_workers=num_workers,
            collate_fn=collate_fn,
            pin_memory=pin_memory,
            drop_last=drop_last,
            timeout=timeout,
            worker_init_fn=worker_init_fn,
            multiprocessing_context=multiprocessing_context,
            **kwargs)
        self.device = device

    def __iter__(self):
        """Yield batches, with their tensor values moved onto ``self.device`` when it is set."""
        for raw_batch in super().__iter__():
            if self.device is None:
                yield raw_batch
                continue
            for field, value in raw_batch.items():
                if not isinstance(value, torch.Tensor):
                    continue
                raw_batch[field] = value.to(self.device)
            yield raw_batch

    def collate_fn(self, samples):
        """Merge a list of sample dicts through ``merge_list_of_dict``.

        NOTE(review): ``__init__`` forwards its ``collate_fn`` argument unchanged, so this method does not
        appear to be installed by default — confirm against callers.
        """
        merged = merge_list_of_dict(samples)
        return merged


class PadSequenceDataLoader(DataLoader):

    def __init__(self, dataset, batch_size=32, shuffle=False, sampler=None,
                 batch_sampler=None, num_workers=0, collate_fn=None,
                 pin_memory=False, drop_last=False, timeout=0,
                 worker_init_fn=None, multiprocessing_context=None,
                 pad: dict = None, vocabs: VocabDict = None, device=None, **kwargs):
        """ A dataloader commonly used for NLP tasks. It offers the following convenience.

        - Batchify each field of samples into a :class:`~torch.Tensor` if the field name satisfies the following
          criterion.
            - Name ends with ``_id`` and a vocab named by the prefix exists in ``vocabs``.
            - Name is in `pad` dict.
            - Name ends with _offset, _id, _count, _ids, _score, _length, _span or _mask.

        - Pad each field according to field name, the vocabs and pad dict.
        - Move :class:`~torch.Tensor` onto device.

        Args:
            dataset: A :class:`~torch.utils.data.Dataset` to be batchified.
            batch_size: Max size of each batch.
            shuffle: ``True`` to shuffle batches.
            sampler: A :class:`~torch.utils.data.Sampler` to sample samples from data.
            batch_sampler: A :class:`~torch.utils.data.Sampler` to sample batches from all batches.
            num_workers: Number of workers for multi-process loading. Note that multi-process loading isn't always
                faster. ``None`` picks ``0`` while debugging and ``2`` otherwise.
            collate_fn: A function to perform batchifying. It must be set to ``None`` in order to make use of the
                 features this class offers.
            pin_memory: If samples are loaded in the Dataset on CPU and would like to be pushed to
                    the GPU, enabling pin_memory can speed up the transfer. It's not useful since most data field are
                    not in Tensor type.
            drop_last: Drop the last batch since it could be half-empty.
            timeout: For multi-worker loading, set a timeout to wait for a worker.
            worker_init_fn: Init function for multi-worker.
            multiprocessing_context: Context for multiprocessing.
            pad: A dict holding field names and their padding values.
            vocabs: A dict of vocabs so padding value can be fetched from it.
            device: The device tensors will be moved onto. ``-1`` is treated the same as ``None``.
            **kwargs: Other arguments will be passed to :meth:`torch.utils.data.DataLoader.__init__`
        """
        if device == -1:
            device = None
        if collate_fn is None:
            collate_fn = self.collate_fn
        if num_workers is None:
            if isdebugging():
                num_workers = 0
            else:
                num_workers = 2
        if batch_sampler is None:
            assert batch_size, 'batch_size has to be specified when batch_sampler is None'
        else:
            # these options are mutually exclusive with an explicit batch sampler
            batch_size = 1
            shuffle = None
            drop_last = None
        # noinspection PyArgumentList
        super(PadSequenceDataLoader, self).__init__(dataset=dataset, batch_size=batch_size, shuffle=shuffle,
                                                    sampler=sampler,
                                                    batch_sampler=batch_sampler, num_workers=num_workers,
                                                    collate_fn=collate_fn,
                                                    pin_memory=pin_memory, drop_last=drop_last, timeout=timeout,
                                                    worker_init_fn=worker_init_fn,
                                                    multiprocessing_context=multiprocessing_context, **kwargs)
        self.vocabs = vocabs
        if isinstance(dataset, TransformableDataset) and dataset.transform:
            transform = dataset.transform
            if not isinstance(transform, TransformList):
                transform = []
            # register a zero padding value for the destination field of every embedding transform
            for each in transform:
                if isinstance(each, EmbeddingNamedTransform):
                    if pad is None:
                        pad = {}
                    if each.dst not in pad:
                        pad[each.dst] = 0
        self.pad = pad
        self.device = device

    def __iter__(self):
        """Yield batches after passing each raw batch through :meth:`tensorize`."""
        for raw_batch in super(PadSequenceDataLoader, self).__iter__():
            yield self.tensorize(raw_batch, vocabs=self.vocabs, pad_dict=self.pad, device=self.device)

    @staticmethod
    def tensorize(raw_batch: Dict[str, Any], vocabs: VocabDict, pad_dict: Dict[str, int] = None, device=None):
        """Convert the paddable fields of a raw batch into tensors, in place.

        The padding value of a field is looked up in this order: the vocab named by the field without its
        ``_id`` suffix, then ``pad_dict``, then a guess based on common field name suffixes. Fields matching
        none of these are left untouched.

        Args:
            raw_batch: A dict mapping field names to the batched values of that field.
            vocabs: Vocabs used to look up the padding index of ``*_id`` fields. Can be ``None``.
            pad_dict: A dict holding field names and their padding values.
            device: If not ``None``, every tensor in the batch is moved onto it.

        Returns:
            The same ``raw_batch`` dict with tensorized fields.
        """
        for field, data in raw_batch.items():
            if isinstance(data, torch.Tensor):
                continue
            vocab_key = field[:-len('_id')] if field.endswith('_id') else None
            vocab: Vocab = vocabs.get(vocab_key, None) if vocabs and vocab_key else None
            if vocab:
                pad = vocab.safe_pad_token_idx
                dtype = torch.long
            elif pad_dict is not None and pad_dict.get(field, None) is not None:
                pad = pad_dict[field]
                dtype = dtype_of(pad)
            elif field.endswith(('_offset', '_id', '_count', '_ids', '_score', '_length', '_span')):
                # guess some common fields to pad
                pad = 0
                dtype = torch.long
            elif field.endswith('_mask'):
                pad = False
                dtype = torch.bool
            else:
                # no need to pad
                continue
            data = PadSequenceDataLoader.pad_data(data, pad, dtype)
            raw_batch[field] = data
        if device is not None:
            for field, data in raw_batch.items():
                if isinstance(data, torch.Tensor):
                    data = data.to(device)
                    raw_batch[field] = data
        return raw_batch

    @staticmethod
    def pad_data(data: Union[torch.Tensor, Iterable], pad, dtype=None, device=None):
        """Perform the actual padding for a given data.

        Three layouts are handled: a sequence of tensors, a sequence of (possibly nested) sequences, and a
        flat list of scalars. Nested sequences are padded into a 3-dimensional tensor whose empty slots are
        filled with ``pad``.

        Args:
            data: Data to be padded.
            pad: Padding value.
            dtype: Data type. When omitted it is inferred from the first non-empty element.
            device: Device to be moved onto.

        Returns:
            torch.Tensor: A ``torch.Tensor``.
        """
        if isinstance(data[0], torch.Tensor):
            data = pad_sequence(data, True, pad)
        elif isinstance(data[0], Iterable):
            inner_is_iterable = False
            # inspect the first non-empty element to find out the nesting depth and the dtype
            for each in data:
                if len(each):
                    if isinstance(each[0], Iterable):
                        inner_is_iterable = True
                        if len(each[0]):
                            if not dtype:
                                dtype = dtype_of(each[0][0])
                    else:
                        inner_is_iterable = False
                        if not dtype:
                            dtype = dtype_of(each[0])
                    break
            if inner_is_iterable:
                max_seq_len = max(len(words) for words in data)
                max_word_len = max(len(chars) for words in data for chars in words)
                ids = torch.zeros(len(data), max_seq_len, max_word_len, dtype=dtype, device=device)
                if pad:
                    # honour a non-zero padding value instead of silently padding with zeros
                    ids.fill_(pad)
                for i, words in enumerate(data):
                    for j, chars in enumerate(words):
                        ids[i][j][:len(chars)] = torch.tensor(chars, dtype=dtype, device=device)
                data = ids
            else:
                data = pad_sequence([torch.tensor(x, dtype=dtype, device=device) for x in data], True, pad)
        elif isinstance(data, list):
            data = torch.tensor(data, dtype=dtype, device=device)
        return data

    def collate_fn(self, samples):
        """Merge a list of sample dicts through ``merge_list_of_dict``; used when no ``collate_fn`` is passed."""
        return merge_list_of_dict(samples)


class CachedDataLoader(object):
    """Materialize every batch of a dataloader into a single file and replay the batches from it."""

    def __init__(self, dataloader: torch.utils.data.DataLoader, filename=None):
        """Cache all batches of ``dataloader`` right away.

        Args:
            dataloader: The dataloader whose batches are cached.
            filename: Path of the cache file. A temporary file prefixed with ``hanlp-cache-`` is created
                when it is omitted.
        """
        self.filename = filename or tempfile.NamedTemporaryFile(prefix='hanlp-cache-', delete=False).name
        self.size = len(dataloader)
        self._build_cache(dataloader)

    def _build_cache(self, dataset, verbose=HANLP_VERBOSE):
        """Serialize each batch of ``dataset`` into the cache file one after another.

        Args:
            dataset: An iterable of batches.
            verbose: Whether to log the progress through a ``CountdownTimer``.
        """
        timer = CountdownTimer(self.size)
        with open(self.filename, "wb") as out:
            for batch in dataset:
                # batches are appended to the same file handle, to be read back sequentially
                torch.save(batch, out, _use_new_zipfile_serialization=False)
                if not verbose:
                    continue
                timer.log(f'Caching {self.filename} [blink][yellow]...[/yellow][/blink]')

    def close(self):
        """Delete the cache file if it exists."""
        path = self.filename
        if os.path.isfile(path):
            os.remove(path)

    def __iter__(self):
        """Read back ``self.size`` batches from the cache file in the order they were written."""
        with open(self.filename, "rb") as src:
            for _ in range(self.size):
                yield torch.load(src)

    def __len__(self):
        """Number of cached batches."""
        return self.size


def _prefetch_generator(dataloader, queue, batchify=None):
    """Endlessly cycle over ``dataloader`` and push each batch into ``queue``.

    This function never returns on its own; it is used as the target of a worker process.

    Args:
        dataloader: An iterable of batches, re-iterated forever.
        queue: Any object with a ``put`` method receiving the batches.
        batchify: Optional function applied to each batch before it is queued.
    """
    while True:
        for raw in dataloader:
            queue.put(batchify(raw) if batchify else raw)


class PrefetchDataLoader(DataLoader):
    def __init__(self, dataloader: torch.utils.data.DataLoader, prefetch: int = 10, batchify: Callable = None) -> None:
        """ A dataloader wrapper which speeds up batchifying using multi-processing. It works best for dataloaders
        of which the batchify takes very long time. But it introduces extra GPU memory consumption since prefetched
        batches are stored in a ``Queue`` on GPU.

        .. Caution::

            PrefetchDataLoader only works in spawn mode with the following initialization code:

            Examples::

                if __name__ == '__main__':
                    import torch

                    torch.multiprocessing.set_start_method('spawn')

            And these 2 lines **MUST** be put into ``if __name__ == '__main__':`` block.

        Args:
            dataloader: A :class:`~torch.utils.data.DataLoader` to be prefetched.
            prefetch: Number of batches to prefetch. Prefetching is disabled while debugging.
            batchify: A batchify function called on each batch of samples. In which case, the inner dataloader shall
                    return samples without really batchifying them.
        """
        super().__init__(dataset=dataloader)
        self._batchify = batchify
        self.prefetch = None if isdebugging() else prefetch
        if self.prefetch:
            self._fire_process(dataloader, prefetch)

    def _fire_process(self, dataloader, prefetch):
        """Start the background process which fills a queue of at most ``prefetch`` batches."""
        self.queue = mp.Queue(prefetch)
        self.process = mp.Process(target=_prefetch_generator, args=(dataloader, self.queue, self._batchify))
        self.process.start()

    def __iter__(self):
        """Yield batches, either computed inline (no prefetch) or taken from the prefetch queue."""
        if not self.prefetch:
            for batch in self.dataset:
                if self._batchify:
                    batch = self._batchify(batch)
                yield batch
        else:
            # the worker cycles forever, so stop after exactly one epoch worth of batches
            size = len(self)
            while size:
                batch = self.queue.get()
                yield batch
                size -= 1

    def close(self):
        """Close this dataloader and terminates internal processes and queue. It's recommended to call this method to
            ensure a program can gracefully shutdown.
        """
        if self.prefetch:
            self.queue.close()
            self.process.terminate()

    @property
    def batchify(self):
        """The batchify function applied to each batch."""
        return self._batchify

    @batchify.setter
    def batchify(self, batchify):
        """Replace the batchify function, restarting the prefetch process when one is running.

        The worker process received the old function when it was started, so it has to be restarted to pick
        up the new one. Without prefetching there is no process or queue and nothing needs restarting.
        """
        self._batchify = batchify
        if self.prefetch:
            self.close()
            self._fire_process(self.dataset, self.prefetch)


class BucketSampler(Sampler):
    # noinspection PyMissingConstructor
    def __init__(self, buckets: Dict[float, List[int]], batch_max_tokens, batch_size=None, shuffle=False):
        """A bucketing based sampler which groups samples into buckets then creates batches from each bucket.

        Each bucket is split into a number of chunks (batches). That number is large enough for a batch to hold
        roughly ``batch_max_tokens`` tokens at most and, when ``batch_size`` is given, ``batch_size`` samples at
        most. It never exceeds the number of samples in the bucket, so no empty batch is produced.

        Args:
            buckets: A dict of which keys are some statistical numbers of each bucket, and values are the indices of
                samples in each bucket.
            batch_max_tokens: Maximum tokens per batch.
            batch_size: Maximum samples per batch.
            shuffle: ``True`` to shuffle batches and samples in a batch.
        """
        self.shuffle = shuffle
        self.sizes, self.buckets = zip(*buckets.items()) if buckets else ((), ())
        # the number of chunks in each bucket, which is clipped by
        # range [1, len(bucket)]
        self.chunks = []
        for size, bucket in zip(self.sizes, self.buckets):
            num_chunks = max(round(size * len(bucket) / batch_max_tokens), 1)
            if batch_size:
                # enough chunks so that none of them holds more than ``batch_size`` samples (ceil division)
                num_chunks = max(num_chunks, -(-len(bucket) // batch_size))
            self.chunks.append(min(len(bucket), num_chunks))

    def __iter__(self):
        """Yield batches as lists of sample indices, bucket by bucket."""
        # if shuffle, shuffle both the buckets and samples in each bucket
        range_fn = torch.randperm if self.shuffle else torch.arange
        for i in range_fn(len(self.buckets)).tolist():
            if not self.chunks[i]:
                # an empty bucket contributes no batch
                continue
            split_sizes = [(len(self.buckets[i]) - j - 1) // self.chunks[i] + 1 for j in range(self.chunks[i])]
            # DON'T use `torch.chunk` which may return wrong number of chunks
            for batch in range_fn(len(self.buckets[i])).split(split_sizes):
                yield [self.buckets[i][j] for j in batch.tolist()]

    def __len__(self):
        """Total number of batches over all buckets."""
        return sum(self.chunks)


class KMeansSampler(BucketSampler):
    def __init__(self, lengths, batch_max_tokens, batch_size=None, shuffle=False, n_buckets=1):
        """A bucket sampler whose buckets are obtained by running KMeans over the sample lengths.

        Args:
            lengths: Lengths of each sample, usually measured by number of tokens.
            batch_max_tokens: Maximum tokens per batch.
            batch_size: Maximum samples per batch.
            shuffle: ``True`` to shuffle batches. Samples in the same batch won't be shuffled since the ordered sequence
                    is helpful to speed up RNNs.
            n_buckets: Number of buckets. Clusters in terms of KMeans. Falls back to ``1`` when it exceeds the
                    number of samples.
        """
        # there cannot be more clusters than samples
        self.n_buckets = 1 if n_buckets > len(lengths) else n_buckets
        self.lengths = lengths
        clustering = kmeans(lengths, self.n_buckets)
        # the two sequences returned by ``kmeans`` are zipped into the bucket dict
        super().__init__(dict(zip(*clustering)), batch_max_tokens, batch_size, shuffle)


class SortingSampler(Sampler):
    # noinspection PyMissingConstructor
    def __init__(self, lengths: List[int], batch_size=None, batch_max_tokens=None, use_effective_tokens=False,
                 shuffle=False) -> None:
        """A sampler which sorts samples according to their lengths. It takes a continuous chunk of sorted samples to
        make a batch. The effective batch size is determined by ``batch_size``, ``batch_max_tokens`` and
        ``use_effective_tokens``.

        Args:
            lengths: Lengths of each sample, usually measured by number of tokens.
            batch_max_tokens: Maximum tokens per batch.
            use_effective_tokens: Whether to calculate the effective number of tokens after padding when applying the
                ``batch_max_tokens``.
            batch_size: Maximum samples per batch.
            shuffle: ``True`` to shuffle batches and samples in a batch.
        """
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.batch_indices = []
        num_tokens = 0
        mini_batch = []
        for i in torch.argsort(torch.tensor(lengths), descending=True).tolist():
            # samples are visited from the longest to the shortest, so with padding every sample in a batch
            # effectively costs as many tokens as the first (longest) sample of that batch
            effective_tokens = lengths[i] if (not mini_batch or not use_effective_tokens) else lengths[mini_batch[0]]
            if (batch_max_tokens is None or num_tokens + effective_tokens <= batch_max_tokens) and (
                    batch_size is None or len(mini_batch) < batch_size):
                mini_batch.append(i)
                num_tokens += effective_tokens
            else:
                if not mini_batch:  # this sequence is longer than batch_max_tokens
                    self.batch_indices.append([i])
                    num_tokens = 0
                else:
                    self.batch_indices.append(mini_batch)
                    mini_batch = [i]
                    # the new batch starts with sample i, so it costs the length of i rather than the
                    # padded length of the batch that has just been closed
                    num_tokens = lengths[i]
        if mini_batch:
            self.batch_indices.append(mini_batch)

    def __iter__(self):
        """Yield batches as lists of sample indices, shuffling the order of batches in place when requested."""
        if self.shuffle:
            random.shuffle(self.batch_indices)
        for batch in self.batch_indices:
            yield batch

    def __len__(self) -> int:
        """Number of batches."""
        return len(self.batch_indices)


class SamplerBuilder(AutoConfigurable, ABC):
    """Abstract factory of samplers. Subclasses are expected to provide ``batch_size`` and
    ``batch_max_tokens`` attributes, which :meth:`scale` reads."""

    @abstractmethod
    def build(self, lengths: List[int], shuffle=False, gradient_accumulation=1, **kwargs) -> Sampler:
        """Create a ``Sampler`` from the statistics of samples and other arguments.

        Args:
            lengths: The lengths of samples.
            shuffle: ``True`` to shuffle batches. Note samples in each mini-batch are not necessarily shuffled.
            gradient_accumulation: Number of mini-batches per update step.
            **kwargs: Other arguments to be passed to the constructor of the sampler.
        """
        pass

    def __call__(self, lengths: List[int], shuffle=False, **kwargs) -> Sampler:
        """Shortcut for :meth:`build`; extra keyword arguments are forwarded to it."""
        sampler = self.build(lengths, shuffle, **kwargs)
        return sampler

    def scale(self, gradient_accumulation):
        r"""Divide ``batch_size`` and ``batch_max_tokens`` (floor division) by ``gradient_accumulation`` so that
        each becomes :math:`\frac{1}{\text{gradient_accumulation}}` of its configured value.

        Falsy limits, and a falsy ``gradient_accumulation``, leave the values unchanged.

        Args:
            gradient_accumulation: Number of mini-batches per update step.

        Returns:
            tuple(int,int): batch_size, batch_max_tokens
        """
        batch_size, batch_max_tokens = self.batch_size, self.batch_max_tokens
        if not gradient_accumulation:
            return batch_size, batch_max_tokens
        if batch_size:
            batch_size = batch_size // gradient_accumulation
        if batch_max_tokens:
            batch_max_tokens = batch_max_tokens // gradient_accumulation
        return batch_size, batch_max_tokens


class SortingSamplerBuilder(SortingSampler, SamplerBuilder):
    # noinspection PyMissingConstructor
    def __init__(self, batch_size=None, batch_max_tokens=None, use_effective_tokens=False) -> None:
        """Builds a :class:`~hanlp.common.dataset.SortingSampler`.

        Args:
            batch_max_tokens: Maximum tokens per batch.
            use_effective_tokens: Whether to calculate effective number of tokens when applying the `batch_max_tokens`.
            batch_size: Maximum samples per batch.
        """
        self.use_effective_tokens = use_effective_tokens
        self.batch_max_tokens = batch_max_tokens
        self.batch_size = batch_size

    def build(self, lengths: List[int], shuffle=False, gradient_accumulation=1, **kwargs) -> Sampler:
        """Build a :class:`SortingSampler` with limits scaled down by ``gradient_accumulation``.

        Args:
            lengths: The lengths of samples.
            shuffle: ``True`` to shuffle batches.
            gradient_accumulation: Number of mini-batches per update step.
            **kwargs: Not used.

        Returns:
            Sampler: The sampler built.
        """
        batch_size, batch_max_tokens = self.scale(gradient_accumulation)
        # pass by keyword: the 4th positional parameter of SortingSampler is ``use_effective_tokens``,
        # not ``shuffle``
        return SortingSampler(lengths, batch_size=batch_size, batch_max_tokens=batch_max_tokens,
                              use_effective_tokens=self.use_effective_tokens, shuffle=shuffle)

    def __len__(self) -> int:
        """Always ``1``: a builder holds no batches of its own."""
        return 1


class KMeansSamplerBuilder(KMeansSampler, SamplerBuilder):
    # noinspection PyMissingConstructor
    def __init__(self, batch_max_tokens, batch_size=None, n_buckets=1):
        """A builder of :class:`~hanlp.common.dataset.KMeansSampler` which only records its settings.

        Args:
            batch_max_tokens: Maximum tokens per batch.
            batch_size: Maximum samples per batch.
            n_buckets: Number of buckets. Clusters in terms of KMeans.
        """
        self.batch_max_tokens = batch_max_tokens
        self.batch_size = batch_size
        self.n_buckets = n_buckets

    def build(self, lengths: List[int], shuffle=False, gradient_accumulation=1, **kwargs) -> Sampler:
        """Create a :class:`KMeansSampler` whose limits are scaled down by ``gradient_accumulation``.

        Args:
            lengths: The lengths of samples.
            shuffle: ``True`` to shuffle batches.
            gradient_accumulation: Number of mini-batches per update step.
            **kwargs: Not used.

        Returns:
            Sampler: The sampler built.
        """
        scaled_size, scaled_tokens = self.scale(gradient_accumulation)
        return KMeansSampler(lengths, scaled_tokens, scaled_size, shuffle, self.n_buckets)

    def __len__(self) -> int:
        """Always ``1``: a builder holds no batches of its own."""
        return 1


class TableDataset(TransformableDataset):
    """A dataset read from a table-like file, one sample (a dict keyed by header) per row."""

    def __init__(self,
                 data: Union[str, List],
                 transform: Union[Callable, List] = None,
                 cache=None,
                 delimiter='auto',
                 strip=True,
                 headers=None) -> None:
        """Create the dataset.

        Args:
            data: Forwarded to the parent constructor.
            transform: Forwarded to the parent constructor.
            cache: Forwarded to the parent constructor.
            delimiter: Delimiter handed to ``read_cells``.
            strip: ``strip`` flag handed to ``read_cells``.
            headers: Column names. When omitted, the first row of the file is taken as headers.
        """
        # these have to be in place before the parent constructor runs
        self.delimiter = delimiter
        self.strip = strip
        self.headers = headers
        super().__init__(data, transform, cache)

    def load_file(self, filepath: str):
        """Yield one dict per row of ``filepath``, mapping each header to the cell in that column.

        Args:
            filepath: Path of the table file.
        """
        rows = read_cells(filepath, strip=self.strip, delimiter=self.delimiter)
        for row_no, cells in enumerate(rows):
            if row_no or self.headers:
                yield dict(zip(self.headers, cells))
                continue
            # first row and no headers given: treat this row as the headers
            self.headers = cells
            if any(len(h) > 32 for h in self.headers):
                warnings.warn('As you did not pass in `headers` to `TableDataset`, the first line is regarded as '
                              'headers. However, the length for some headers are too long (>32), which might be '
                              'wrong. To make sure, pass `headers=...` explicitly.')


================================================
FILE: hanlp/common/keras_component.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-26 14:45
import logging
import math
import os
import sys
from abc import ABC, abstractmethod
from typing import Optional, List, Any, Dict

import numpy as np
import tensorflow as tf

import hanlp.utils
from hanlp_common.io import save_json, load_json
from hanlp.callbacks.fine_csv_logger import FineCSVLogger
from hanlp.common.component import Component
from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp.metrics.chunking.iobes_tf import IOBES_F1_TF
from hanlp.optimizers.adamw import AdamWeightDecay
from hanlp.utils import io_util
from hanlp.utils.io_util import get_resource, tempdir_human
from hanlp.utils.log_util import init_logger, logger
from hanlp.utils.string_util import format_scores
from hanlp.utils.tf_util import format_metrics, size_of_dataset, summary_of_model, get_callback_by_class, NumpyEncoder
from hanlp.utils.time_util import Timer, now_datetime
from hanlp_common.reflection import str_to_type, classpath_of
from hanlp_common.structure import SerializableDict
from hanlp_common.util import merge_dict


class KerasComponent(Component, ABC):
    def __init__(self, transform: Transform) -> None:
        """Create a component around ``transform``.

        Args:
            transform: The transform of this component. Its existing config entries are copied into the
                component's config, after which both share the very same config object.
        """
        super().__init__()
        self.meta = dict(class_path=classpath_of(self), hanlp_version=hanlp.version.__version__)
        self.model: Optional[tf.keras.Model] = None
        self.config = SerializableDict()
        self.transform = transform
        # share config with transform for convenience, so we don't need to pass args around
        transform_config = self.transform.config
        if transform_config:
            for name, value in transform_config.items():
                self.config[name] = value
        self.transform.config = self.config

    def evaluate(self, input_path: str, save_dir=None, output=False, batch_size=128, logger: logging.Logger = None,
                 callbacks: List[tf.keras.callbacks.Callback] = None, warm_up=True, verbose=True, **kwargs):
        """Evaluate the model on a file and optionally write its predictions to disk.

        Args:
            input_path: Path of the evaluation file; resolved through ``get_resource``.
            save_dir: Directory for the log file and the prediction output. Required when ``output`` is set.
            output: ``True`` to write predictions to ``<save_dir>/<name>.predict<ext>``, or a ``str`` giving
                the output path explicitly.
            batch_size: Batch size used to build the dataset.
            logger: Logger for reporting. One is created under ``save_dir`` when omitted and ``save_dir`` is set.
            callbacks: Callbacks forwarded to ``evaluate_dataset``.
            warm_up: ``True`` to run one prediction on the first batch before timing.
            verbose: Controls the level (``INFO`` or ``WARN``) of the logger created here.
            **kwargs: Forwarded to ``evaluate_dataset``.

        Returns:
            tuple: ``(loss, score, speed)`` where ``speed`` is samples per second.
        """
        input_path = get_resource(input_path)
        file_prefix, ext = os.path.splitext(input_path)
        name = os.path.basename(file_prefix)
        if not name:
            name = 'evaluate'
        if save_dir and not logger:
            logger = init_logger(name=name, root_dir=save_dir, level=logging.INFO if verbose else logging.WARN,
                                 mode='w')
        tst_data = self.transform.file_to_dataset(input_path, batch_size=batch_size)
        samples = self.num_samples_in(tst_data)
        num_batches = math.ceil(samples / batch_size)
        if warm_up:
            # run a single batch through the model before the timer starts
            for x, y in tst_data:
                self.model.predict_on_batch(x)
                break
        if output:
            assert save_dir, 'Must pass save_dir in order to output'
            if isinstance(output, bool):
                output = os.path.join(save_dir, name) + '.predict' + ext
            elif isinstance(output, str):
                output = output
            else:
                raise RuntimeError('output ({}) must be of type bool or str'.format(repr(output)))
        timer = Timer()
        eval_outputs = self.evaluate_dataset(tst_data, callbacks, output, num_batches, **kwargs)
        # only the first three values are used; ``output`` may be replaced by what ``evaluate_dataset`` returns
        loss, score, output = eval_outputs[0], eval_outputs[1], eval_outputs[2]
        delta_time = timer.stop()
        speed = samples / delta_time.delta_seconds

        if logger:
            # look for an IOBES F1 metric among the model metrics to append its full report
            f1: IOBES_F1_TF = None
            for metric in self.model.metrics:
                if isinstance(metric, IOBES_F1_TF):
                    f1 = metric
                    break
            extra_report = ''
            if f1:
                overall, by_type, extra_report = f1.state.result(full=True, verbose=False)
                extra_report = ' \n' + extra_report
            logger.info('Evaluation results for {} - '
                        'loss: {:.4f} - {} - speed: {:.2f} sample/sec{}'
                        .format(name + ext, loss,
                                format_scores(score) if isinstance(score, dict) else format_metrics(self.model.metrics),
                                speed, extra_report))
        if output:
            logger.info('Saving output to {}'.format(output))
            with open(output, 'w', encoding='utf-8') as out:
                self.evaluate_output(tst_data, out, num_batches, self.model.metrics)

        return loss, score, speed

    def num_samples_in(self, dataset):
        """Return the number of samples in ``dataset`` as computed by ``size_of_dataset``."""
        return size_of_dataset(dataset)

    def evaluate_dataset(self, tst_data, callbacks, output, num_batches, **kwargs):
        """Evaluate the model on ``tst_data`` for ``num_batches`` steps.

        Args:
            tst_data: The dataset to evaluate on.
            callbacks: Callbacks passed to ``self.model.evaluate``.
            output: Returned unchanged as the third element of the result.
            num_batches: Number of steps to evaluate.
            **kwargs: Not used here.

        Returns:
            tuple: ``(loss, score, output)``.
        """
        loss, score = self.model.evaluate(tst_data, callbacks=callbacks, steps=num_batches)
        return loss, score, output

    def evaluate_output(self, tst_data, out, num_batches, metrics: List[tf.keras.metrics.Metric]):
        """Predict every batch of ``tst_data``, update ``metrics`` and write the predictions to ``out``.

        Progress and the running metrics are printed on a single console line which is rewritten per batch.

        Args:
            tst_data: Iterable of batches; element ``0`` of a batch is the input, element ``1`` the gold output.
            out: Writable text stream handed to ``evaluate_output_to_file``.
            num_batches: Total number of batches, used for the progress display only.
            metrics: Metrics to reset first and then update with each batch.
        """
        # out.write('x\ty_true\ty_pred\n')
        for metric in metrics:
            metric.reset_states()
        for step, batch in enumerate(tst_data, start=1):
            outputs = self.model.predict_on_batch(batch[0])
            for metric in metrics:
                metric(batch[1], outputs, getattr(outputs, '_keras_mask', None))
            self.evaluate_output_to_file(batch, outputs, out)
            progress = '\r{}/{} {}'.format(step, num_batches, format_metrics(metrics))
            print(progress, end='')
        print()

    def evaluate_output_to_file(self, batch, outputs, out):
        """Write one line of text per sample of ``batch``, as formatted by the transform.

        Args:
            batch: A batch whose element ``0`` is the input and element ``1`` the gold output.
            outputs: The predictions of the model for this batch.
            out: Writable text stream.
        """
        inputs = self.transform.X_to_inputs(batch[0])
        golds = self.transform.Y_to_outputs(batch[1], gold=True)
        preds = self.transform.Y_to_outputs(outputs, gold=False)
        for x, y_gold, y_pred in zip(inputs, golds, preds):
            line = self.transform.input_truth_output_to_str(x, y_gold, y_pred)
            out.write(line)

    def _capture_config(self, config: Dict,
                        exclude=(
                                'trn_data', 'dev_data', 'save_dir', 'kwargs', 'self', 'logger', 'verbose',
                                'dev_batch_size', '__class__')):
        """Store the arguments of a call into ``self.config``.

        Entries of a nested ``'kwargs'`` dict are merged into ``config`` first (``config`` is updated in
        place). Values exposing ``get_config`` are serialized with
        ``tf.keras.utils.serialize_keras_object``; keys listed in ``exclude`` are dropped.

        Args:
            config: Typically the ``locals()`` of the caller.
            exclude: Keys which should not be stored.
        """
        if 'kwargs' in config:
            config.update(config['kwargs'])
        captured = {}
        for key, value in config.items():
            if hasattr(value, 'get_config'):
                value = tf.keras.utils.serialize_keras_object(value)
            captured[key] = value
        for key in exclude:
            captured.pop(key, None)
        self.config.update(captured)

    def save_meta(self, save_dir, filename='meta.json', **kwargs):
        """Stamp the meta with its creation time and save it as JSON.

        Args:
            save_dir: Directory to write into.
            filename: Name of the meta file.
            **kwargs: Extra entries merged into the meta before saving.
        """
        # this must be an assignment: an annotation (``x: value``) would store nothing
        self.meta['create_time'] = now_datetime()
        self.meta.update(kwargs)
        save_json(self.meta, os.path.join(save_dir, filename))

    def load_meta(self, save_dir, filename='meta.json'):
        """Merge the meta file found in ``save_dir`` into ``self.meta``; do nothing when the file is missing.

        Args:
            save_dir: Directory (resolved through ``get_resource``) holding the meta file.
            filename: Name of the meta file.
        """
        metapath = os.path.join(get_resource(save_dir), filename)
        if not os.path.isfile(metapath):
            return
        self.meta.update(load_json(metapath))

    def save_config(self, save_dir, filename='config.json'):
        """Save ``self.config`` as JSON to ``filename`` under ``save_dir``."""
        self.config.save_json(os.path.join(save_dir, filename))

    def load_config(self, save_dir, filename='config.json'):
        """Load ``self.config`` from the JSON file ``filename`` under ``save_dir``.

        ``save_dir`` is resolved through ``get_resource`` first.
        """
        save_dir = get_resource(save_dir)
        self.config.load_json(os.path.join(save_dir, filename))

    def save_weights(self, save_dir, filename='model.h5'):
        """Save the weights of ``self.model`` to ``filename`` under ``save_dir``."""
        self.model.save_weights(os.path.join(save_dir, filename))

    def load_weights(self, save_dir, filename='model.h5', **kwargs):
        """Load the weights of ``self.model`` from ``filename`` under ``save_dir``.

        The model must already be built (or own weights), which is asserted up front.

        Args:
            save_dir: Directory (resolved through ``get_resource``) holding the weights file.
            filename: Name of the weights file.
            **kwargs: Not used here.
        """
        assert self.model.built or self.model.weights, 'You must call self.model.built() in build_model() ' \
                                                       'in order to load it'
        save_dir = get_resource(save_dir)
        self.model.load_weights(os.path.join(save_dir, filename))

    def save_vocabs(self, save_dir, filename='vocabs.json'):
        """Save every ``VocabTF`` attribute of the transform into a single JSON file.

        Args:
            save_dir: Directory to write into.
            filename: Name of the vocabs file.
        """
        vocabs = SerializableDict()
        for attr_name, attr_value in vars(self.transform).items():
            if not isinstance(attr_value, VocabTF):
                continue
            vocabs[attr_name] = attr_value.to_dict()
        target = os.path.join(save_dir, filename)
        vocabs.save_json(target)

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Restore the vocabs saved in ``save_dir`` and attach each of them to the transform by name.

        Args:
            save_dir: Directory (resolved through ``get_resource``) holding the vocabs file.
            filename: Name of the vocabs file.
        """
        source = os.path.join(get_resource(save_dir), filename)
        stored = SerializableDict()
        stored.load_json(source)
        for attr_name, state in stored.items():
            restored = VocabTF()
            restored.copy_from(state)
            setattr(self.transform, attr_name, restored)

    def load_transform(self, save_dir) -> Transform:
        """Try to load the transform only, without building the model.

        The config and the vocabs are loaded from ``save_dir``, then ``build_config`` and ``lock_vocabs`` of
        the transform are called. This method might fail due to the fact that it avoids building the model.
        If it does fail, then you have to use ``load``, which might be too heavy but that's the best we can do.

        Args:
            save_dir: The path to load.

        Returns:
            Transform: The transform of this component.
        """
        save_dir = get_resource(save_dir)
        self.load_config(save_dir)
        self.load_vocabs(save_dir)
        self.transform.build_config()
        self.transform.lock_vocabs()
        return self.transform

    def save(self, save_dir: str, **kwargs):
        """Persist config, vocabs and weights (in that order) into ``save_dir``.

        Args:
            save_dir: Directory to write into.
            **kwargs: Not used.
        """
        for persist in (self.save_config, self.save_vocabs, self.save_weights):
            persist(save_dir)

    def load(self, save_dir: str, logger=hanlp.utils.log_util.logger, **kwargs):
        """Load a saved component: config, vocabs, model, weights and finally meta.

        The unresolved ``save_dir`` is remembered in ``self.meta['load_path']``.

        Args:
            save_dir: Identifier of the component, resolved through ``get_resource``.
            logger: Logger forwarded to ``build``.
            **kwargs: Overrides merged into the config before building; also forwarded to ``load_weights``.
        """
        self.meta['load_path'] = save_dir
        resolved = get_resource(save_dir)
        self.load_config(resolved)
        self.load_vocabs(resolved)
        # inplace=True: the overrides are written into self.config itself
        build_args = merge_dict(self.config, training=False, logger=logger, **kwargs, overwrite=True, inplace=True)
        self.build(**build_args)
        self.load_weights(resolved, **kwargs)
        self.load_meta(resolved)

    @property
    def input_shape(self) -> List:
        """The first entry of the transform's ``output_shapes``."""
        shapes = self.transform.output_shapes
        return shapes[0]

    def build(self, logger, **kwargs):
        """Build the model together with its optimizer, loss and metrics, then compile it.

        Args:
            logger: Logger forwarded to ``build_metrics``.
            **kwargs: ``training``, ``loss`` and ``metrics`` are read from here; everything else comes from
                ``self.config``.

        Returns:
            A tuple of ``(model, optimizer, loss, metrics)``.
        """
        self.transform.build_config()
        self.model = self.build_model(**merge_dict(self.config, training=kwargs.get('training', None),
                                                   loss=kwargs.get('loss', None)))
        self.transform.lock_vocabs()
        optimizer = self.build_optimizer(**self.config)
        # build_loss requires a ``loss`` argument, so pass None explicitly when the config has none
        loss = self.build_loss(
            **self.config if 'loss' in self.config else dict(list(self.config.items()) + [('loss', None)]))
        # metrics may be overridden through kwargs and default to 'accuracy'
        metrics = self.build_metrics(**merge_dict(self.config, metrics=kwargs.get('metrics', 'accuracy'),
                                                  logger=logger, overwrite=True))
        # a single Metric instance is wrapped into a list; any other non-list value is passed on untouched
        if not isinstance(metrics, list):
            if isinstance(metrics, tf.keras.metrics.Metric):
                metrics = [metrics]
        if not self.model.built:
            # prefer building by calling the model on sample data when the subclass provides some
            sample_inputs = self.sample_data
            if sample_inputs is not None:
                self.model(sample_inputs)
            else:
                # otherwise derive the input shape from the first entry of the transform's output shapes
                if len(self.transform.output_shapes[0]) == 1 and self.transform.output_shapes[0][0] is None:
                    x_shape = self.transform.output_shapes[0]
                else:
                    x_shape = list(self.transform.output_shapes[0])
                    for i, shape in enumerate(x_shape):
                        x_shape[i] = [None] + shape  # batch + X.shape
                self.model.build(input_shape=x_shape)
        self.compile_model(optimizer, loss, metrics)
        return self.model, optimizer, loss, metrics

    def compile_model(self, optimizer, loss, metrics):
        """Compile ``self.model``, retrying inside a custom object scope on ``ValueError``.

        Args:
            optimizer: Optimizer passed to ``compile``.
            loss: Loss passed to ``compile``.
            metrics: Metrics passed to ``compile``.
        """

        def _compile():
            self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics,
                               run_eagerly=self.config.run_eagerly)

        try:
            _compile()
        except ValueError:
            # Retry with AdamWeightDecay registered under its lower-cased name.
            from keras.saving.object_registration import CustomObjectScope
            with CustomObjectScope({'adamweightdecay': AdamWeightDecay}):
                _compile()

    def build_optimizer(self, optimizer, **kwargs) -> tf.keras.optimizers.Optimizer:
        """Turn an optimizer specification into an optimizer and record its serialized form.

        Args:
            optimizer: An optimizer instance, or a ``str``/``dict`` specification which is deserialized with
                ``AdamWeightDecay`` available as a custom object.
            **kwargs: Not used.

        Returns:
            The optimizer. Its serialized form is stored in ``self.config.optimizer``.

        Raises:
            ValueError: If the specification cannot be deserialized.
        """
        if isinstance(optimizer, (str, dict)):
            custom_objects = {'AdamWeightDecay': AdamWeightDecay}

            def _deserialize(spec):
                return tf.keras.utils.deserialize_keras_object(spec, module_objects=vars(tf.keras.optimizers),
                                                               custom_objects=custom_objects)

            try:
                optimizer = _deserialize(optimizer)
            except ValueError:
                # Only a dict specification carries a ``config`` that can be patched. For anything else
                # re-raise the original error instead of masking it with a TypeError/KeyError.
                if not isinstance(optimizer, dict) or 'config' not in optimizer:
                    raise
                # presumably the legacy ``decay`` argument is what newer optimizers reject -- retry without it
                optimizer['config'].pop('decay', None)
                optimizer = _deserialize(optimizer)
        self.config.optimizer = tf.keras.utils.serialize_keras_object(optimizer)
        return optimizer

    def build_loss(self, loss, **kwargs):
        """Resolve the loss to train with.

        Args:
            loss: A falsy value selects sparse categorical cross entropy on logits; a ``str``/``dict`` is
                deserialized from ``tf.keras.losses``; anything else is used as is.
            **kwargs: Not used.

        Returns:
            The loss. When it is a ``tf.keras.losses.Loss`` its serialized form is stored in
            ``self.config.loss``.
        """
        losses = tf.keras.losses
        if not loss:
            loss = losses.SparseCategoricalCrossentropy(
                from_logits=True,
                reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
        elif isinstance(loss, (str, dict)):
            loss = tf.keras.utils.deserialize_keras_object(loss, module_objects=vars(losses))
        if isinstance(loss, losses.Loss):
            self.config.loss = tf.keras.utils.serialize_keras_object(loss)
        return loss

    def build_transform(self, **kwargs):
        """Return the transform to use; this default hands back ``self.transform`` unchanged.

        Args:
            **kwargs: Not used by this default implementation.
        """
        transform = self.transform
        return transform

    def build_vocab(self, trn_data, logger):
        """Fit the transform on the training data and log a summary of its vocabs.

        Args:
            trn_data: Training data passed to ``transform.fit`` along with ``**self.config``.
            logger: Logger passed to ``transform.summarize_vocabs``.

        Returns:
            Whatever ``transform.fit`` returns; ``fit`` treats it as the number of training examples.
        """
        transform = self.transform
        num_examples = transform.fit(trn_data, **self.config)
        transform.summarize_vocabs(logger)
        return num_examples

    def build_metrics(self, metrics, logger: logging.Logger, **kwargs):
        """Build the metrics to compile the model with.

        This default ignores all of its arguments.

        Returns:
            A list holding one ``SparseCategoricalAccuracy`` named ``accuracy``.
        """
        return [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]

    @abstractmethod
    def build_model(self, **kwargs) -> tf.keras.Model:
        """Create the model; subclasses must implement this.

        Args:
            **kwargs: ``build`` passes the config merged with ``training`` and ``loss``.
        """

    def fit(self, trn_data, dev_data, save_dir, batch_size, epochs, run_eagerly=False, logger=None, verbose=True,
            finetune: str = None, **kwargs):
        """Train this component and restore the best checkpoint when a better one than the last epoch exists.

        Args:
            trn_data: Training data handed to ``build_vocab`` and ``build_train_dataset``.
            dev_data: Development data handed to ``build_valid_dataset``.
            save_dir: Directory for config, vocabs, meta, weights and history. A temporary directory is
                created when it is falsy.
            batch_size: Number of samples per batch.
            epochs: Number of epochs to train.
            run_eagerly: Not read here directly; presumably captured into the config, from which
                ``compile_model`` reads ``run_eagerly``.
            logger: Logger to report progress to. One is created under ``save_dir`` when omitted.
            verbose: Selects ``INFO`` (``True``) or ``WARN`` (``False``) level for the logger created here.
            finetune: Optional resource (a weights file or a directory containing ``model.h5``) whose weights
                are loaded by name, skipping mismatches, before training.
            **kwargs: Extra hyperparameters; not read here directly.

        Returns:
            The value returned by ``train_loop``, or, after a ``KeyboardInterrupt``, the ``History`` callback
            looked up among the callbacks.
        """
        # Must stay the first statement: locals() may only contain the call arguments at this point.
        self._capture_config(locals())
        self.transform = self.build_transform(**self.config)
        if not save_dir:
            save_dir = tempdir_human()
        if not logger:
            logger = init_logger(name='train', root_dir=save_dir, level=logging.INFO if verbose else logging.WARN)
        # Emitted only after the logger exists; warning earlier crashed when ``logger`` was left as None.
        if sys.version_info >= (3, 10):
            logger.warning(f'Training with TensorFlow {tf.__version__} has not been tested on Python '
                           f'{sys.version_info.major}.{sys.version_info.minor}. Please downgrade to '
                           f'Python<=3.9 in case any compatibility issues arise.')
        logger.info('Hyperparameter:\n' + self.config.to_json())
        num_examples = self.build_vocab(trn_data, logger)
        # assert num_examples, 'You forgot to return the number of training examples in your build_vocab'
        logger.info('Building...')
        train_steps_per_epoch = math.ceil(num_examples / batch_size) if num_examples else None
        self.config.train_steps = train_steps_per_epoch * epochs if num_examples else None
        model, optimizer, loss, metrics = self.build(**merge_dict(self.config, logger=logger, training=True))
        logger.info('Model built:\n' + summary_of_model(self.model))
        if finetune:
            finetune = get_resource(finetune)
            if os.path.isdir(finetune):
                finetune = os.path.join(finetune, 'model.h5')
            model.load_weights(finetune, by_name=True, skip_mismatch=True)
            logger.info(f'Loaded pretrained weights from {finetune} for finetuning')
        self.save_config(save_dir)
        self.save_vocabs(save_dir)
        self.save_meta(save_dir)
        trn_data = self.build_train_dataset(trn_data, batch_size, num_examples)
        dev_data = self.build_valid_dataset(dev_data, batch_size)
        callbacks = self.build_callbacks(save_dir, **merge_dict(self.config, overwrite=True, logger=logger))
        # need to know #batches, otherwise progbar crashes
        dev_steps = math.ceil(self.num_samples_in(dev_data) / batch_size)
        checkpoint = get_callback_by_class(callbacks, tf.keras.callbacks.ModelCheckpoint)
        timer = Timer()
        try:
            history = self.train_loop(**merge_dict(self.config, trn_data=trn_data, dev_data=dev_data, epochs=epochs,
                                                   num_examples=num_examples,
                                                   train_steps_per_epoch=train_steps_per_epoch, dev_steps=dev_steps,
                                                   callbacks=callbacks, logger=logger, model=model, optimizer=optimizer,
                                                   loss=loss,
                                                   metrics=metrics, overwrite=True))
        except KeyboardInterrupt:
            print()
            # ``np.inf`` instead of the ``np.Inf`` alias, which NumPy 2.0 removed.
            if not checkpoint or checkpoint.best in (np.inf, -np.inf):
                # no checkpoint has been written yet, so keep the current weights
                self.save_weights(save_dir)
                logger.info('Aborted with model saved')
            else:
                logger.info(f'Aborted with model saved with best {checkpoint.monitor} = {checkpoint.best:.4f}')
            history = get_callback_by_class(callbacks, tf.keras.callbacks.History)
        delta_time = timer.stop()
        if history and hasattr(history, 'epoch'):
            trained_epoch = len(history.epoch)
            logger.info('Trained {} epochs in {}, each epoch takes {}'.
                        format(trained_epoch, delta_time, delta_time / trained_epoch if trained_epoch else delta_time))
            save_json(history.history, io_util.path_join(save_dir, 'history.json'), cls=NumpyEncoder)
            # Without a checkpoint callback there is nothing to monitor or restore.
            monitor_history = history.history.get(checkpoint.monitor, None) if checkpoint else None
            # ``best`` is absent from the history when the monitored value never improved.
            if monitor_history and checkpoint.best in monitor_history \
                    and checkpoint.best != monitor_history[-1]:
                best_epoch_ago = len(monitor_history) - monitor_history.index(checkpoint.best)
                logger.info(f'Restored the best model saved with best '
                            f'{checkpoint.monitor} = {checkpoint.best:.4f} '
                            f'saved {best_epoch_ago} epochs ago')
                self.load_weights(save_dir)  # restore best model
        return history

    def train_loop(self, trn_data, dev_data, epochs, num_examples, train_steps_per_epoch, dev_steps, model, optimizer,
                   loss, metrics, callbacks,
                   logger, **kwargs):
        """Run training by delegating to ``self.model.fit``.

        Only ``trn_data``, ``dev_data``, ``epochs``, ``train_steps_per_epoch``, ``dev_steps`` and ``callbacks``
        are used by this default implementation; the remaining arguments are ignored here.

        Returns:
            The value returned by ``self.model.fit``.
        """
        fit_options = dict(epochs=epochs,
                           steps_per_epoch=train_steps_per_epoch,
                           validation_data=dev_data,
                           callbacks=callbacks,
                           validation_steps=dev_steps)
        return self.model.fit(trn_data, **fit_options)  # type:tf.keras.callbacks.History

    def build_valid_dataset(self, dev_data, batch_size):
        """Convert the development data into a dataset through the transform, without shuffling.

        Args:
            dev_data: Development data passed to ``transform.file_to_dataset``.
            batch_size: Number of samples per batch.
        """
        return self.transform.file_to_dataset(dev_data, batch_size=batch_size, shuffle=False)

    def build_train_dataset(self, trn_data, batch_size, num_examples):
        """Convert the training data into a shuffled dataset through the transform.

        ``repeat=-1`` is requested when ``self.config.train_steps`` is truthy, ``repeat=None`` otherwise.

        Args:
            trn_data: Training data passed to ``transform.file_to_dataset``.
            batch_size: Number of samples per batch.
            num_examples: Not used here.
        """
        repeat = -1 if self.config.train_steps else None
        return self.transform.file_to_dataset(trn_data, batch_size=batch_size, shuffle=True, repeat=repeat)

    def build_callbacks(self, save_dir, logger, **kwargs):
        """Assemble the Keras callbacks used during training.

        A best-only weights checkpoint, a TensorBoard callback and a CSV logger are always created. A learning
        rate scheduler, ``ReduceLROnPlateau`` and early stopping are appended when ``lr_decay_per_epoch``,
        ``anneal_factor`` and ``early_stopping_patience`` are set in the config.

        Args:
            save_dir: Directory receiving ``model.h5``, ``logs`` and ``train.log``.
            logger: Logger for debug messages and warnings.
            **kwargs: Only ``metrics`` is read; for a list/tuple its last element names the monitored metric.

        Returns:
            The list of callbacks.
        """
        metric_name = kwargs.get('metrics', 'accuracy')
        if isinstance(metric_name, (list, tuple)):
            metric_name = metric_name[-1]
        monitor = f'val_{metric_name}'
        checkpoint = tf.keras.callbacks.ModelCheckpoint(os.path.join(save_dir, 'model.h5'),
                                                        monitor=monitor,
                                                        save_best_only=True,
                                                        mode='max',
                                                        save_weights_only=True)
        logger.debug(f'Monitor {checkpoint.monitor} for checkpoint')
        log_dir = io_util.makedirs(io_util.path_join(save_dir, 'logs'))
        callbacks = [
            checkpoint,
            tf.keras.callbacks.TensorBoard(log_dir=log_dir),
            FineCSVLogger(os.path.join(save_dir, 'train.log'), separator=' | ', append=True),
        ]

        lr_decay_per_epoch = self.config.get('lr_decay_per_epoch', None)
        if lr_decay_per_epoch:
            learning_rate = self.model.optimizer.get_config().get('learning_rate', None)
            if learning_rate:
                logger.debug(f'Created LearningRateScheduler with lr_decay_per_epoch={lr_decay_per_epoch}')

                def _decayed_lr(epoch):
                    return learning_rate / (1 + lr_decay_per_epoch * epoch)

                callbacks.append(tf.keras.callbacks.LearningRateScheduler(_decayed_lr))
            else:
                logger.warning('Learning rate decay not supported for optimizer={}'.format(repr(self.model.optimizer)))

        anneal_factor = self.config.get('anneal_factor', None)
        if anneal_factor:
            patience = self.config.get('anneal_patience', 10)
            callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(factor=anneal_factor, patience=patience))

        early_stopping_patience = self.config.get('early_stopping_patience', None)
        if early_stopping_patience:
            callbacks.append(tf.keras.callbacks.EarlyStopping(monitor=monitor, mode='max', verbose=1,
                                                              patience=early_stopping_patience))
        return callbacks

    def on_train_begin(self):
        """Hook invoked before the training starts; does nothing by default."""
        return None

    def predict(self, data: Any, batch_size=None, **kwargs):
        """Predict on raw inputs batch by batch.

        Args:
            data: Raw input(s). A falsy value yields an empty list straight away.
            batch_size: Number of samples per batch; defaults to ``self.config.batch_size``.
            **kwargs: ``gold`` is forwarded to ``transform.inputs_to_dataset``; everything is forwarded to
                ``predict_batch``.

        Returns:
            The list of outputs, or only its first element when ``transform.input_to_inputs`` flags the
            input as flat.
        """
        assert self.model, 'Please call fit or load before predict'
        if not data:
            return []
        data, flat = self.transform.input_to_inputs(data)
        batch_size = batch_size or self.config.batch_size
        dataset = self.transform.inputs_to_dataset(data, batch_size=batch_size, gold=kwargs.get('gold', False))

        outputs = []
        offset = 0
        sliceable = isinstance(data, list)
        for batch in dataset:
            last = batch[-1]
            if not isinstance(last, tf.Tensor):
                last = last[0]
            samples_in_batch = tf.shape(last)[0]
            # if data is a generator, it's usually one-time, not able to transform into a list
            inputs = data[offset:offset + samples_in_batch] if sliceable else None
            outputs.extend(self.predict_batch(batch, inputs=inputs, **kwargs))
            offset += samples_in_batch
        self.transform.cleanup()

        return outputs[0] if flat else outputs

    def predict_batch(self, batch, inputs=None, **kwargs):
        """Predict one batch and lazily yield the outputs decoded by the transform.

        Args:
            batch: A batch whose first element is fed to the model.
            inputs: The raw inputs of this batch when available, forwarded to ``transform.Y_to_outputs``.
            **kwargs: Forwarded to ``transform.Y_to_outputs``.
        """
        features = batch[0]
        predictions = self.model.predict_on_batch(features)
        yield from self.transform.Y_to_outputs(predictions, X=features, inputs=inputs, batch=batch, **kwargs)

    @property
    def sample_data(self):
        """Sample inputs ``build`` may call the model on to build it; ``None`` by default."""
        return None

    @staticmethod
    def from_meta(meta: dict, **kwargs):
        """Instantiate and load a component described by its metadata.

        Args:
            meta: Metadata holding ``class_path`` (the type to instantiate) and ``load_path`` (where to load
                it from).
            **kwargs: Not used.

        Returns:
            KerasComponent: The loaded component.
        """
        component_type = str_to_type(meta['class_path'])
        component: KerasComponent = component_type()
        assert 'load_path' in meta, f'{meta} doesn\'t contain load_path field'
        component.load(meta['load_path'])
        return component

    def export_model_for_serving(self, export_dir=None, version=1, overwrite=False, show_hint=False):
        """Export ``self.model`` as a SavedModel into ``export_dir/version``.

        Args:
            export_dir: Base directory of the export. Defaults to the resolved ``load_path`` from ``self.meta``.
            version: Name of the sub-directory holding this export.
            overwrite: Export even when the version directory already exists.
            show_hint: Log a ``tensorflow_model_server`` command line after exporting.

        Returns:
            The base export directory.
        """
        assert self.model, 'You have to fit or load a model before exporting it'
        if not export_dir:
            assert 'load_path' in self.meta, 'When not specifying save_dir, load_path has to present'
            export_dir = get_resource(self.meta['load_path'])
        model_path = os.path.join(export_dir, str(version))
        already_exported = os.path.isdir(model_path)
        if already_exported and not overwrite:
            logger.info(f'{model_path} exists, skip since overwrite = {overwrite}')
            return export_dir
        logger.info(f'Exporting to {export_dir} ...')
        tf.saved_model.save(self.model, model_path)
        logger.info(f'Successfully exported model to {export_dir}')
        if show_hint:
            model_name = os.path.splitext(os.path.basename(self.meta["load_path"]))[0]
            logger.info(f'You can serve it through \n'
                        f'tensorflow_model_server --model_name={model_name} '
                        f'--model_base_path={export_dir} --rest_api_port=8888')
        return export_dir

    def serve(self, export_dir=None, grpc_port=8500, rest_api_port=0, overwrite=False, dry_run=False):
        """Export the model and launch ``tensorflow_model_server`` on it in the background.

        Args:
            export_dir: Base directory of the export, see ``export_model_for_serving``.
            grpc_port: Value for the server's ``--port`` option.
            rest_api_port: Value for the server's ``--rest_api_port`` option.
            overwrite: Forwarded to ``export_model_for_serving``.
            dry_run: ``True`` to only log the server command, keeping ``self.model`` and not launching anything.
        """
        export_dir = self.export_model_for_serving(export_dir, show_hint=False, overwrite=overwrite)
        if not dry_run:
            del self.model  # free memory
        logger.info('The inputs of exported model is shown below.')
        # NOTE(review): both commands go through the shell with paths interpolated unescaped.
        os.system(f'saved_model_cli show --all --dir {export_dir}/1')
        model_name = os.path.splitext(os.path.basename(self.meta["load_path"]))[0]
        cmd = (f'nohup tensorflow_model_server --model_name={model_name} '
               f'--model_base_path={export_dir} --port={grpc_port} --rest_api_port={rest_api_port} '
               f'>serve.log 2>&1 &')
        logger.info(f'Running ...\n{cmd}')
        if not dry_run:
            os.system(cmd)


================================================
FILE: hanlp/common/structure.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-26 14:58
from typing import Dict

from hanlp_common.configurable import Configurable
from hanlp_common.reflection import classpath_of
from hanlp_common.structure import SerializableDict


class ConfigTracker(Configurable):

    def __init__(self, locals_: Dict, exclude=('kwargs', 'self', '__class__', 'locals_')) -> None:
        """This base class helps sub-classes to capture their arguments passed to ``__init__``, and also their types so
        that they can be deserialized from a config in dict form.

        Entries of a ``kwargs`` argument are flattened into the config, and any captured value exposing a
        ``config`` attribute is replaced by that attribute. The dict passed in is left untouched.

        Args:
            locals_: Obtained by :func:`locals`.
            exclude: Arguments to be excluded.

        Examples:
            >>> class MyClass(ConfigTracker):
            ...     def __init__(self, i_need_this='yes') -> None:
            ...         super().__init__(locals())
            >>> obj = MyClass()
            >>> print(obj.config)
            {'i_need_this': 'yes', 'classpath': 'test_config_tracker.MyClass'}

        """
        # Work on a copy so that the caller's ``locals()`` dict is not polluted with the kwargs entries.
        captured = dict(locals_)
        if 'kwargs' in captured:
            captured.update(captured['kwargs'])
        self.config = SerializableDict(
            (k, v.config if hasattr(v, 'config') else v) for k, v in captured.items() if k not in exclude)
        self.config['classpath'] = classpath_of(self)


class History(object):
    def __init__(self):
        """ A history of training context. It records how many steps have passed and provides methods to decide whether
        an update should be performed, and to calculate number of training steps given dataloader size and
        ``gradient_accumulation``.
        """
        self.num_mini_batches = 0

    def step(self, gradient_accumulation):
        """ Whether the training procedure should perform an update.

        Every call counts as one more mini batch.

        Args:
            gradient_accumulation: Number of batches per update.

        Returns:
            bool: ``True`` to update.
        """
        self.num_mini_batches += 1
        return self.num_mini_batches % gradient_accumulation == 0

    def num_training_steps(self, num_batches, gradient_accumulation):
        """ Calculate number of training steps.

        Counts how many of the next ``num_batches`` mini batches, numbered right after the
        ``num_mini_batches`` already recorded, would trigger an update.

        Args:
            num_batches: Size of dataloader.
            gradient_accumulation: Number of batches per update (an integer).

        Returns:
            int: The number of updates within the next ``num_batches`` mini batches.
        """
        if num_batches <= 0:
            return 0
        # The multiples of p in (a, b] number b // p - a // p; no need to enumerate every batch.
        period = abs(gradient_accumulation)
        passed = self.num_mini_batches
        return (passed + num_batches) // period - passed // period


================================================
FILE: hanlp/common/torch_component.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 21:20
import logging
import os
import re
import time
from abc import ABC, abstractmethod
from typing import Optional, Dict, List, Union, Callable

import torch
from torch import nn
from torch.utils.data import DataLoader

import hanlp
from hanlp.common.component import Component
from hanlp.common.dataset import TransformableDataset
from hanlp.common.transform import VocabDict
from hanlp.utils.io_util import get_resource, basename_no_ext
from hanlp.utils.log_util import init_logger, flash
from hanlp.utils.torch_util import cuda_devices, set_seed
from hanlp_common.configurable import Configurable
from hanlp_common.constant import IDX, HANLP_VERBOSE
from hanlp_common.reflection import classpath_of
from hanlp_common.structure import SerializableDict
from hanlp_common.util import merge_dict, isdebugging


class TorchComponent(Component, ABC):
    def __init__(self, **kwargs) -> None:
        """The base class for all components using PyTorch as backend.

        It provides common workflows of building vocabs, datasets, dataloaders and models. These workflows are
        more of a conventional guideline than enforced protocols, which means a subclass has the freedom to
        override or completely skip some steps.

        Args:
            **kwargs: Additional arguments to be stored in the ``config`` property.
        """
        super().__init__()
        # No model exists until one is built or loaded.
        self.model: Optional[nn.Module] = None
        self.config = SerializableDict(**kwargs)
        self.vocabs = VocabDict()

    def _capture_config(self, locals_: Dict,
                        exclude=(
                                'trn_data', 'dev_data', 'save_dir', 'kwargs', 'self', 'logger', 'verbose',
                                'dev_batch_size', '__class__', 'devices', 'eval_trn')):
        """Save arguments to config

        Args:
          locals_: Dict: 
          exclude:  (Default value = ('trn_data')
          'dev_data': 
          'save_dir': 
          'kwargs': 
          'self': 
          'logger': 
          'verbose': 
          'dev_batch_size': 
          '__class__': 
          'devices'): 

        Returns:

        
        """
        if 'kwargs' in locals_:
            locals_.update(locals_['kwargs'])
        locals_ = dict((k, v) for k, v in locals_.items() if k not in exclude and not k.startswith('_'))
        self.config.update(locals_)
        return self.config

    def save_weights(self, save_dir, filename='model.pt', trainable_only=True, **kwargs):
        """Save model weights to a directory.

        Args:
            save_dir: The directory to save weights into.
            filename: A file name for weights.
            trainable_only: ``True`` to keep only the state dict entries named after parameters which require
                gradients. Useful when the model contains lots of static embeddings.
            **kwargs: Not used for now.
        """
        module = self.model_
        weights = module.state_dict()
        if trainable_only:
            keep = {name for name, param in module.named_parameters() if param.requires_grad}
            weights = {name: tensor for name, tensor in weights.items() if name in keep}
        torch.save(weights, os.path.join(save_dir, filename))

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """Load weights from a directory.

        The weights are loaded onto CPU and applied non-strictly, so missing or unexpected keys are tolerated.

        Args:
            save_dir: The directory to load weights from.
            filename: A file name for weights.
            **kwargs: Not used.
        """
        weights_path = os.path.join(get_resource(save_dir), filename)
        try:
            state = torch.load(weights_path, map_location='cpu', weights_only=True)
            self.model_.load_state_dict(state, strict=False)
        except TypeError:
            # presumably raised by torch versions whose ``torch.load`` lacks ``weights_only``
            state = torch.load(weights_path, map_location='cpu')
            self.model_.load_state_dict(state, strict=False)

    def save_config(self, save_dir, filename='config.json'):
        """Save the savable part of the config into a directory.

        Args:
            save_dir: The directory to save config.
            filename: A file name for config.
        """
        target = os.path.join(save_dir, filename)
        self._savable_config.save_json(target)

    def load_config(self, save_dir, filename='config.json', **kwargs):
        """Load config from a directory.

        Values which are dicts carrying a ``classpath`` are instantiated through ``Configurable.from_config``,
        then ``on_config_ready`` is notified.

        Args:
            save_dir: The directory to load config.
            filename: A file name for config.
            **kwargs: K-V pairs to override config.
        """
        save_dir = get_resource(save_dir)
        config = self.config
        config.load_json(os.path.join(save_dir, filename))
        config.update(kwargs)  # overwrite config loaded from disk
        for name, value in config.items():
            if isinstance(value, dict) and 'classpath' in value:
                config[name] = Configurable.from_config(value)
        self.on_config_ready(**config, save_dir=save_dir)

    def save_vocabs(self, save_dir, filename='vocabs.json'):
        """Save vocabularies to a directory. Does nothing when this component has no ``vocabs`` attribute.

        Args:
            save_dir: The directory to save vocabularies.
            filename:  The name for vocabularies.
        """
        if not hasattr(self, 'vocabs'):
            return
        self.vocabs.save_vocabs(save_dir, filename)

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Load vocabularies from a directory into a fresh ``VocabDict``.

        Does nothing when this component has no ``vocabs`` attribute.

        Args:
            save_dir: The directory to load vocabularies.
            filename:  The name for vocabularies.
        """
        if not hasattr(self, 'vocabs'):
            return
        self.vocabs = VocabDict()
        self.vocabs.load_vocabs(save_dir, filename)

    def save(self, save_dir: str, **kwargs):
        """Save this component (config, vocabs and weights, in that order) to a directory.

        Args:
            save_dir: The directory to save this component.
            **kwargs: Not used.
        """
        for persist in (self.save_config, self.save_vocabs, self.save_weights):
            persist(save_dir)

    def load(self, save_dir: str, devices=None, verbose=HANLP_VERBOSE, **kwargs):
        """Load from a local/remote component.

        Config and vocabs are loaded first, then the model is built in non-training mode, its weights are
        loaded, and it is moved onto ``devices`` and switched to evaluation mode.

        Args:
            save_dir: An identifier which can be a local path or a remote URL or a pre-defined string.
            devices: The devices this component will be moved onto. When omitted while a model already exists,
                ``self.devices`` is reused.
            verbose: ``True`` to log loading progress.
            **kwargs: To override some configs.
        """
        save_dir = get_resource(save_dir)
        # flash('Loading config and vocabs [blink][yellow]...[/yellow][/blink]')
        # Read the current devices before ``self.model`` gets replaced below.
        if devices is None and self.model:
            devices = self.devices
        self.load_config(save_dir, **kwargs)
        self.load_vocabs(save_dir)
        if verbose:
            flash('Building model [blink][yellow]...[/yellow][/blink]')
        self.config.pop('training', None)  # Some legacy versions accidentally put training into config file
        # inplace=True: the overrides in kwargs are written into self.config itself
        self.model = self.build_model(
            **merge_dict(self.config, **kwargs, overwrite=True, inplace=True), training=False, save_dir=save_dir)
        if verbose:
            flash('')  # clear the progress message
        self.load_weights(save_dir, **kwargs)
        self.to(devices, verbose=verbose)
        self.model.eval()

    def fit(self,
            trn_data,
            dev_data,
            save_dir,
            batch_size,
            epochs,
            devices=None,
            logger=None,
            seed=None,
            finetune: Union[bool, str] = False,
            eval_trn=True,
            _device_placeholder=False,
            **kwargs):
        """Fit to data, triggers the training procedure. For training set and dev set, they shall be local or remote
        files.

        Args:
            trn_data: Training set.
            dev_data: Development set.
            save_dir: The directory to save trained component.
            batch_size: The number of samples in a batch.
            epochs: Number of epochs.
            devices: Devices this component will live on.
            logger: Any :class:`logging.Logger` instance.
            seed: Random seed to reproduce this training.
            finetune: ``True`` to load from ``save_dir`` instead of creating a randomly initialized component. ``str``
                to specify a different ``save_dir`` to load from. The loaded model is the one being trained; no
                fresh model is built in this case.
            eval_trn: Evaluate training set after each update. This can slow down the training but provides a quick
                diagnostic for debugging.
            _device_placeholder: ``True`` to create a placeholder tensor which triggers PyTorch to occupy devices so
                other components won't take these devices as first choices.
            **kwargs: Hyperparameters used by sub-classes.

        Returns:
            Whatever ``execute_training_loop`` returns.

        """
        # Common initialization steps
        # Must stay the first statement: locals() may only contain the call arguments at this point.
        config = self._capture_config(locals())
        if not logger:
            logger = self.build_logger('train', save_dir)
        if seed is None:
            self.config.seed = 233 if isdebugging() else int(time.time())
        set_seed(self.config.seed)
        logger.info(self._savable_config.to_json(sort=True))
        if isinstance(devices, list) or devices is None or isinstance(devices, float):
            flash('[yellow]Querying CUDA devices [blink]...[/blink][/yellow]')
            devices = -1 if isdebugging() else cuda_devices(devices)
            flash('')
        # flash(f'Available GPUs: {devices}')
        if isinstance(devices, list):
            first_device = (devices[0] if devices else -1)
        elif isinstance(devices, dict):
            first_device = next(iter(devices.values()))
        elif isinstance(devices, int):
            first_device = devices
        else:
            first_device = -1
        if _device_placeholder and first_device >= 0:
            _dummy_placeholder = self._create_dummy_placeholder_on(first_device)
        if finetune:
            if isinstance(finetune, str):
                self.load(finetune, devices=devices)
            else:
                self.load(save_dir, devices=devices)
            self.config.finetune = finetune
            self.vocabs.unlock()  # For extending vocabs
            logger.info(
                f'Finetune model loaded with {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}'
                f'/{sum(p.numel() for p in self.model.parameters())} trainable/total parameters.')
        self.on_config_ready(**self.config, save_dir=save_dir)
        trn = self.build_dataloader(**merge_dict(config, data=trn_data, batch_size=batch_size, shuffle=True,
                                                 training=True, device=first_device, logger=logger, vocabs=self.vocabs,
                                                 overwrite=True))
        dev = self.build_dataloader(**merge_dict(config, data=dev_data, batch_size=batch_size, shuffle=False,
                                                 training=None, device=first_device, logger=logger, vocabs=self.vocabs,
                                                 overwrite=True)) if dev_data else None
        # When finetuning, the model loaded above must be kept: building a fresh one here would throw the
        # pretrained weights away.
        if not finetune:
            flash('[yellow]Building model [blink]...[/blink][/yellow]')
            self.model = self.build_model(**merge_dict(config, training=True), logger=logger)
            flash('')
            # Check before touching the model so a missing return value is reported clearly.
            assert self.model is not None, 'build_model is not properly implemented.'
            logger.info(f'Model built with {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}'
                        f'/{sum(p.numel() for p in self.model.parameters())} trainable/total parameters.')
            _description = repr(self.model)
            # only short model descriptions are worth logging in full
            if len(_description.split('\n')) < 10:
                logger.info(_description)
        self.save_config(save_dir)
        self.save_vocabs(save_dir)
        self.to(devices, logger)
        if _device_placeholder and first_device >= 0:
            del _dummy_placeholder
        criterion = self.build_criterion(**merge_dict(config, trn=trn))
        optimizer = self.build_optimizer(**merge_dict(config, trn=trn, criterion=criterion))
        metric = self.build_metric(**self.config)
        if hasattr(trn, 'dataset') and dev and hasattr(dev, 'dataset'):
            if trn.dataset and dev.dataset:
                logger.info(f'{len(trn.dataset)}/{len(dev.dataset)} samples in trn/dev set.')
        if hasattr(trn, '__len__') and dev and hasattr(dev, '__len__'):
            trn_size = len(trn) // self.config.get('gradient_accumulation', 1)
            ratio_width = len(f'{trn_size}/{trn_size}')
        else:
            ratio_width = None
        return self.execute_training_loop(**merge_dict(config, trn=trn, dev=dev, epochs=epochs, criterion=criterion,
                                                       optimizer=optimizer, metric=metric, logger=logger,
                                                       save_dir=save_dir,
                                                       devices=devices,
                                                       ratio_width=ratio_width,
                                                       trn_data=trn_data,
                                                       dev_data=dev_data,
                                                       eval_trn=eval_trn,
                                                       overwrite=True))

    def build_logger(self, name, save_dir):
        """Create the :class:`logging.Logger` used by this component.

        Args:
            name: Name handed to ``init_logger``.
            save_dir: Handed to ``init_logger`` as its ``root_dir``.

        Returns:
            logging.Logger: Whatever ``init_logger`` returns for an ``INFO`` level, message-only format.
        """
        options = dict(name=name, root_dir=save_dir, level=logging.INFO, fmt="%(message)s")
        return init_logger(**options)

    @abstractmethod
    def build_dataloader(self, data, batch_size, shuffle=False, device=None, logger: logging.Logger = None,
                         **kwargs) -> DataLoader:
        """Create the dataloader for a training, dev or test set.

        Subclasses must implement this. Implementations are encouraged to build any missing vocabs here as well.

        Args:
            data: The samples, given either as a path or as a list of samples.
            batch_size: How many samples go into one batch.
            shuffle: ``True`` to shuffle the dataloader.
            device: The device tensors should be placed on.
            logger: Used to report progress when loading is slow or when vocabs need to be built.
            **kwargs: The remaining entries of ``**self.config``.
        """

    def build_vocabs(self, trn: torch.utils.data.Dataset, logger: logging.Logger):
        """Hook for building vocabs; the default implementation does nothing.

        Args:
            trn: The training set.
            logger: Used to report progress.
        """

    @property
    def _savable_config(self):
        """A copy of ``self.config`` converted for saving.

        Keys are visited in sorted order. Values exposing a ``config`` attribute (other than
        ``SerializableDict`` instances) are replaced by that attribute, sets and tuples become lists, and dict
        values are converted recursively. ``classpath`` and ``hanlp_version`` are then set on the result.
        """

        def to_savable(value):
            if not isinstance(value, SerializableDict) and hasattr(value, 'config'):
                value = value.config
            elif isinstance(value, (set, tuple)):
                value = list(value)
            if isinstance(value, dict):
                value = {key: to_savable(item) for key, item in value.items()}
            return value

        snapshot = SerializableDict(
            (key, to_savable(value)) for key, value in sorted(self.config.items()))
        snapshot.update({
            'classpath': classpath_of(self),
            'hanlp_version': hanlp.__version__,
        })
        return snapshot

    @abstractmethod
    def build_optimizer(self, **kwargs):
        """Create the optimizer. Subclasses must implement this.

        Args:
            **kwargs: Keyword arguments; the implementing subclass defines which ones it accepts.
        """

    @abstractmethod
    def build_criterion(self, **kwargs):
        """Create the criterion (loss function). Subclasses must implement this.

        Args:
            **kwargs: Keyword arguments; the implementing subclass defines which ones it accepts.
        """

    @abstractmethod
    def build_metric(self, **kwargs):
        """Create the metric(s). Subclasses must implement this.

        Args:
            **kwargs: Keyword arguments; the implementing subclass defines which ones it accepts.
        """

    @abstractmethod
    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None,
                              **kwargs):
        """Run the training loop. Subclasses must implement this.

        Args:
            trn: Dataloader of the training set.
            dev: Dataloader of the development set.
            epochs: How many epochs to train for.
            criterion: The loss function.
            optimizer: The optimizer(s).
            metric: The metric(s).
            save_dir: Directory this component is saved into.
            logger: Used to report progress.
            devices: The devices the component and dataloader live on.
            ratio_width: Width, in characters, of the dataset size; lets the logger align its messages.
            **kwargs: Further hyper-parameters supplied by the subclass.
        """

    @abstractmethod
    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        """Train on a single dataloader. Subclasses must implement this.

        Args:
            trn: Dataloader of the training set.
            criterion: The loss function.
            optimizer: The optimizer.
            metric: The metric(s).
            logger: Used to report progress.
            **kwargs: Further hyper-parameters supplied by the subclass.
        """

    @abstractmethod
    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        """Evaluate on a single dataloader. Subclasses must implement this.

        Args:
            data: A dataloader, built from any data source.
            criterion: The loss function.
            metric: The metric(s).
            output: Whether outputs should be saved into some file.
            **kwargs: Unused by this base declaration.
        """

    @abstractmethod
    def build_model(self, training=True, **kwargs) -> torch.nn.Module:
        """Create the model. Subclasses must implement this.

        Args:
            training: ``True`` when invoked during training.
            **kwargs: The entries of ``**self.config``.

        Raises:
            NotImplementedError: Always, in this base declaration.
        """
        raise NotImplementedError()

    def evaluate(self, tst_data, save_dir=None, logger: logging.Logger = None, batch_size=None, output=False, **kwargs):
        """Evaluate test set.

        Args:
            tst_data: Test set, which is usually a file path. A ``str`` is first resolved through ``get_resource``.
            save_dir: The directory to save evaluation scores or predictions.
            logger: Logger for reporting progress. When ``None``, one is created with :meth:`build_logger`.
            batch_size: Batch size for test dataloader. When falsy, ``self.config['batch_size']`` is used and
                ``32`` if that is absent.
            output: Whether to save outputs into some file. ``True`` is replaced by the path returned from
                :meth:`generate_prediction_filename`.
            **kwargs: Merged with ``self.config`` through ``merge_dict`` and forwarded to
                ``evaluate_dataloader``.

        Returns:
            (metric, outputs) where outputs are the return values of ``evaluate_dataloader``.

        Raises:
            RuntimeError: If ``self.model`` is not set, i.e. neither ``fit`` nor ``load`` has been called.
        """
        if not self.model:
            raise RuntimeError('Call fit or load before evaluate.')
        if isinstance(tst_data, str):
            tst_data = get_resource(tst_data)
            filename = os.path.basename(tst_data)
        else:
            filename = None
        if output is True:
            # In-memory data has no file name to derive from, so 'test.txt' is used as the template.
            output = self.generate_prediction_filename(tst_data if isinstance(tst_data, str) else 'test.txt', save_dir)
        if logger is None:
            _logger_name = basename_no_ext(filename) if filename else None
            logger = self.build_logger(_logger_name, save_dir)
        if not batch_size:
            batch_size = self.config.get('batch_size', 32)
        data = self.build_dataloader(**merge_dict(self.config, data=tst_data, batch_size=batch_size, shuffle=False,
                                                  device=self.devices[0], logger=logger, overwrite=True))
        # Follow the chain of ``.dataset`` attributes down to the innermost object.
        dataset = data
        while dataset and hasattr(dataset, 'dataset'):
            dataset = dataset.dataset
        num_samples = len(dataset) if dataset else None
        if output and isinstance(dataset, TransformableDataset):
            # Store each sample's position under the IDX key, both in the raw data and in the cache if present.
            def add_idx(samples):
                for idx, sample in enumerate(samples):
                    if sample:
                        sample[IDX] = idx

            add_idx(dataset.data)
            if dataset.cache:
                add_idx(dataset.cache)

        criterion = self.build_criterion(**self.config)
        metric = self.build_metric(**self.config)
        start = time.time()
        # NOTE(review): this merge_dict call does not pass overwrite=True, unlike the dataloader call above;
        # confirm how batch_size/metric/logger interact with same-named keys already in self.config.
        outputs = self.evaluate_dataloader(data, criterion=criterion, filename=filename, output=output, input=tst_data,
                                           save_dir=save_dir,
                                           test=True,
                                           num_samples=num_samples,
                                           **merge_dict(self.config, batch_size=batch_size, metric=metric,
                                                        logger=logger, **kwargs))
        elapsed = time.time() - start
        if logger:
            # Report throughput in samples when the sample count is known, otherwise in batches.
            if num_samples:
                logger.info(f'speed: {num_samples / elapsed:.0f} samples/second')
            else:
                logger.info(f'speed: {len(data) / elapsed:.0f} batches/second')
        return metric, outputs

    def generate_prediction_filename(self, tst_data, save_dir):
        """Derive the path predictions are written to.

        Args:
            tst_data: Path of the test set; only its base name is used.
            save_dir: Directory the prediction file is placed in.

        Returns:
            ``save_dir`` joined with the base name of ``tst_data``, with ``.pred`` inserted before the extension.
        """
        assert isinstance(tst_data,
                          str), 'tst_data has be a str in order to infer the output name'
        stem, extension = os.path.splitext(os.path.basename(tst_data))
        return os.path.join(save_dir, f'{stem}.pred{extension}')

    def to(self,
           devices: Union[int, float, List[int], Dict[str, Union[int, torch.device]]] = None,
           logger: logging.Logger = None, verbose=HANLP_VERBOSE):
        """Move this component to devices.

        Args:
            devices: Target devices. ``None``, an ``int`` or a ``float`` is resolved through ``cuda_devices``;
                ``-1`` or ``[-1]`` becomes an empty list; a ``list`` moves the model to its first element; a
                ``dict`` maps regular expressions over module names to devices; a ``torch.device`` is used as is.
            logger: Logger for printing progress report, as copying a model from CPU to GPU can takes several seconds.
            verbose: ``True`` to print progress when logger is None.

        Raises:
            ValueError: If ``devices`` is truthy but is not a ``list``, ``dict`` or ``torch.device``.
        """
        if devices is None:
            # if getattr(torch, 'has_mps', None):  # mac M1 chips
            #     devices = torch.device('mps:0')
            # else:
            devices = cuda_devices(devices)
        elif devices == -1 or devices == [-1]:
            devices = []
        elif isinstance(devices, (int, float)):
            devices = cuda_devices(devices)
        if devices:
            if logger:
                logger.info(f'Using GPUs: [on_blue][cyan][bold]{devices}[/bold][/cyan][/on_blue]')
            if isinstance(devices, list):
                if verbose:
                    flash(f'Moving model to GPUs {devices} [blink][yellow]...[/yellow][/blink]')
                self.model = self.model.to(devices[0])
                # Wrap for multiple devices unless debugging or the model is already a DataParallel.
                if len(devices) > 1 and not isdebugging() and not isinstance(self.model, nn.DataParallel):
                    self.model = self.parallelize(devices)
            elif isinstance(devices, dict):
                for name, module in self.model.named_modules():
                    for regex, device in devices.items():
                        try:
                            on_device: torch.device = next(module.parameters()).device
                        except StopIteration:
                            # The module has no parameters, so its current device cannot be read.
                            continue
                        # Skip modules that already sit on the requested device.
                        if on_device == device:
                            continue
                        if isinstance(device, int):
                            if on_device.index == device:
                                continue
                        if re.match(regex, name):
                            # NOTE(review): an empty name is replaced by '*' for display, and the replaced name is
                            # what later regexes in this inner loop are matched against; confirm this is intended.
                            if not name:
                                name = '*'
                            # NOTE(review): this progress message is printed regardless of ``verbose``.
                            flash(f'Moving module [yellow]{name}[/yellow] to [on_yellow][magenta][bold]{device}'
                                  f'[/bold][/magenta][/on_yellow]: [red]{regex}[/red]\n')
                            module.to(device)
            elif isinstance(devices, torch.device):
                if verbose:
                    flash(f'Moving model to {devices} [blink][yellow]...[/yellow][/blink]')
                self.model = self.model.to(devices)
            else:
                raise ValueError(f'Unrecognized devices {devices}')
            if verbose:
                flash('')
        else:
            # No devices resolved: this branch only logs and does not move the model.
            if logger:
                logger.info('Using [red]CPU[/red]')

    def parallelize(self, devices: List[Union[int, torch.device]]):
        """Wrap ``self.model`` in :class:`torch.nn.DataParallel`.

        Args:
            devices: Passed to ``DataParallel`` as ``device_ids``.

        Returns:
            The wrapping ``DataParallel`` module; ``self.model`` itself is not reassigned here.
        """
        wrapped = nn.DataParallel(self.model, device_ids=devices)
        return wrapped

    @property
    def devices(self):
        """The devices this component lives on.

        Returns:
            ``None`` when there is no model, the model's ``device_ids`` when it has that attribute, otherwise a
            one-element list holding the device of the model's first parameter.
        """
        model = self.model
        if model is None:
            return None
        if hasattr(model, 'device_ids'):
            return model.device_ids
        first_parameter = next(model.parameters())
        return [first_parameter.device]

    @property
    def device(self):
        """The first device this component lives on, or ``None`` when ``self.devices`` is empty or ``None``.
        """
        available = self.devices
        return available[0] if available else None

    def on_config_ready(self, **kwargs):
        """Callback fired once the config is ready, during either ``fit`` or ``load``.

        The default does nothing; subclasses can override it to run extra initialization.

        Args:
            **kwargs: Unused by the default implementation.
        """

    @property
    def model_(self) -> nn.Module:
        """The underlying model, unwrapped from ``DataParallel`` if necessary.

        Returns:
            ``self.model.module`` when ``self.model`` is a ``DataParallel``, otherwise ``self.model`` itself.
        """
        model = self.model
        return model.module if isinstance(model, nn.DataParallel) else model

    # noinspection PyMethodOverriding
    @abstractmethod
    def predict(self, *args, **kwargs):
        """Run prediction on user-supplied data. Subclasses must implement this.

        Prefer ``__call__`` over calling this directly: this method is not wrapped in ``torch.no_grad``, so a
        direct call computes gradients that are not needed.

        Args:
            *args: Sentences or tokens.
            **kwargs: Options consumed by sub-classes.
        """

    @staticmethod
    def _create_dummy_placeholder_on(device):
        """Allocate a 16x16 zero tensor on ``device``; a negative ``device`` selects ``'cpu:0'``."""
        target = 'cpu:0' if device < 0 else device
        return torch.zeros(16, 16, device=target)

    @torch.no_grad()
    def __call__(self, *args, **kwargs):
        """Run prediction on user-supplied data with gradients disabled.

        ``self.config`` is merged with ``kwargs`` (``overwrite=True``) and the result is handed to the parent
        ``__call__``; the whole call runs under ``torch.no_grad``.

        Args:
            *args: Sentences or tokens.
            **kwargs: Options consumed by sub-classes.
        """
        options = merge_dict(self.config, overwrite=True, **kwargs)
        return super().__call__(*args, **options)


================================================
FILE: hanlp/common/transform.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-03 14:44
import logging
import os
from abc import ABC, abstractmethod
from typing import Tuple, Union, List

from hanlp_common.constant import EOS, PAD
from hanlp_common.structure import SerializableDict
from hanlp_common.configurable import Configurable
from hanlp.common.vocab import Vocab
from hanlp.utils.io_util import get_resource
from hanlp_common.io import load_json
from hanlp_common.reflection import classpath_of, str_to_type
from hanlp.utils.string_util import ispunct


class ToIndex(ABC):
    """Base class for transforms that own a :class:`Vocab` and can save or load it as JSON."""

    def __init__(self, vocab: Vocab = None) -> None:
        """
        Args:
            vocab: The vocab to use; a new ``Vocab()`` is created when ``None``.
        """
        super().__init__()
        self.vocab = Vocab() if vocab is None else vocab

    @abstractmethod
    def __call__(self, sample):
        pass

    def save_vocab(self, save_dir, filename='vocab.json'):
        """Write ``self.vocab.to_dict()`` as JSON to ``filename`` inside ``save_dir``."""
        serialized = SerializableDict()
        serialized.update(self.vocab.to_dict())
        serialized.save_json(os.path.join(save_dir, filename))

    def load_vocab(self, save_dir, filename='vocab.json'):
        """Read ``filename`` from ``save_dir`` (resolved via ``get_resource``) and copy it into ``self.vocab``."""
        path = os.path.join(get_resource(save_dir), filename)
        loaded = SerializableDict()
        loaded.load_json(path)
        self.vocab.copy_from(loaded)


class FieldToIndex(ToIndex):
    """Apply a vocab to one field of a sample and store the result in another field."""

    def __init__(self, src, vocab: Vocab, dst=None) -> None:
        """
        Args:
            src: Name of the field to read.
            vocab: The vocab applied to that field.
            dst: Name of the field to write; ``f'{src}_id'`` when falsy.
        """
        super().__init__(vocab)
        self.src = src
        self.dst = dst or f'{src}_id'

    def __call__(self, sample: dict):
        indexed = self.vocab(sample[self.src])
        sample[self.dst] = indexed
        return sample

    def _default_filename(self, filename):
        # A falsy filename falls back to one derived from the destination field.
        return filename or f'{self.dst}_vocab.json'

    def save_vocab(self, save_dir, filename=None):
        """Save the vocab; ``filename`` defaults to ``f'{self.dst}_vocab.json'``."""
        super().save_vocab(save_dir, self._default_filename(filename))

    def load_vocab(self, save_dir, filename=None):
        """Load the vocab; ``filename`` defaults to ``f'{self.dst}_vocab.json'``."""
        super().load_vocab(save_dir, self._default_filename(filename))


class VocabList(list):
    """A list of :class:`FieldToIndex` transforms whose vocabs can be saved and loaded together."""

    def __init__(self, *fields) -> None:
        """
        Args:
            *fields: Items in any form accepted by :meth:`append`.
        """
        super().__init__()
        # Delegate to ``append`` so that every supported item form is normalized in one place.
        for each in fields:
            self.append(each)

    def append(self, item: Union[str, Tuple[str, Vocab], Tuple[str, str, Vocab], FieldToIndex]) -> None:
        """Append a transform, converting shorthand forms into :class:`FieldToIndex`.

        Args:
            item: One of: a field name (a fresh vocab is created), ``(src, vocab)``, ``(src, dst, vocab)``,
                or a ready :class:`FieldToIndex`.

        Raises:
            ValueError: If ``item`` is a list/tuple of unsupported length or of an unsupported type.
        """
        if isinstance(item, str):
            # ``vocab`` is a required parameter of FieldToIndex; ``None`` makes ToIndex create a new Vocab.
            item = FieldToIndex(item, vocab=None)
        elif isinstance(item, (list, tuple)):
            if len(item) == 2:
                item = FieldToIndex(src=item[0], vocab=item[1])
            elif len(item) == 3:
                item = FieldToIndex(src=item[0], dst=item[1], vocab=item[2])
            else:
                raise ValueError(f'Unsupported argument length: {item}')
        elif isinstance(item, FieldToIndex):
            pass
        else:
            raise ValueError(f'Unsupported argument type: {item}')
        # ``super(self)`` is invalid because ``self`` is an instance, not a type.
        super().append(item)

    def save_vocab(self, save_dir):
        """Save the vocab of every transform into ``save_dir`` under its default file name."""
        for each in self:
            each.save_vocab(save_dir, None)

    def load_vocab(self, save_dir):
        """Load the vocab of every transform from ``save_dir`` using its default file name."""
        for each in self:
            each.load_vocab(save_dir, None)


class VocabDict(SerializableDict):

    def __init__(self, *args, **kwargs) -> None:
        """A dict holding :class:`hanlp.common.vocab.Vocab` instances. When used as a transform, it transforms the field
        corresponding to each :class:`hanlp.common.vocab.Vocab` into indices.

        Args:
            *args: A list of vocab names; each gets a new ``Vocab()``.
            **kwargs: Names and corresponding :class:`hanlp.common.vocab.Vocab` instances.
        """
        vocabs = dict(kwargs)
        for each in args:
            vocabs[each] = Vocab()
        super().__init__(vocabs)

    def save_vocabs(self, save_dir, filename='vocabs.json'):
        """Save vocabularies to a directory. Only values that are ``Vocab`` instances are written.

        Args:
            save_dir: The directory to save vocabularies.
            filename:  The name for vocabularies.
        """
        vocabs = SerializableDict()
        for key, value in self.items():
            if isinstance(value, Vocab):
                vocabs[key] = value.to_dict()
        vocabs.save_json(os.path.join(save_dir, filename))

    def load_vocabs(self, save_dir, filename='vocabs.json', vocab_cls=Vocab):
        """Load vocabularies from a directory.

        Args:
            save_dir: The directory to load vocabularies. Resolved through ``get_resource``.
            filename:  The name for vocabularies.
            vocab_cls: Class instantiated for entries that do not declare their own ``type``.
        """
        save_dir = get_resource(save_dir)
        vocabs = SerializableDict()
        vocabs.load_json(os.path.join(save_dir, filename))
        self._load_vocabs(self, vocabs, vocab_cls)

    @staticmethod
    def _load_vocabs(vd, vocabs: dict, vocab_cls=Vocab):
        """Populate ``vd`` from the serialized form in ``vocabs``.

        Args:
            vd: The mapping to fill.
            vocabs: Serialized vocabs keyed by name. An entry containing ``idx_to_token`` is treated as a single
                vocab; any other entry is treated as a nested group and loaded recursively into a new
                :class:`VocabDict`.
            vocab_cls: Default class for the new vocab
        """
        for key, value in vocabs.items():
            if 'idx_to_token' in value:
                cls = value.get('type', None)
                if cls:
                    cls = str_to_type(cls)
                else:
                    cls = vocab_cls
                vocab = cls()
                vocab.copy_from(value)
                vd[key] = vocab
            else:  # nested Vocab
                # noinspection PyTypeChecker
                vd[key] = nested = VocabDict()
                VocabDict._load_vocabs(nested, value, vocab_cls)

    def lock(self):
        """
        Lock each vocab.
        """
        for key, value in self.items():
            if isinstance(value, Vocab):
                value.lock()

    def unlock(self):
        """
        Unlock each vocab.
        """
        for key, value in self.items():
            if isinstance(value, Vocab):
                value.unlock()

    @property
    def mutable(self):
        """``True`` when there is no ``Vocab`` value at all, or when at least one of them is mutable."""
        status = [v.mutable for v in self.values() if isinstance(v, Vocab)]
        return len(status) == 0 or any(status)

    def __call__(self, sample: dict):
        """Index every field of ``sample`` that has a vocab of the same name.

        For each ``Vocab`` stored under ``key``, a non-``None`` ``sample[key]`` is passed to the vocab and the
        result is stored under ``f'{key}_id'``. ``sample`` is modified in place and returned.
        """
        for key, value in self.items():
            if isinstance(value, Vocab):
                field = sample.get(key, None)
                if field is not None:
                    sample[f'{key}_id'] = value(field)
        return sample

    def __getattr__(self, key):
        """Expose entries as attributes.

        Raises:
            AttributeError: For names starting with ``__``, so that protocol probes on special methods do not
                get answered with dict entries.
            KeyError: For any other name that is not a key of this dict.
        """
        if key.startswith('__'):
            # ``dict`` defines no ``__getattr__`` to delegate to, so signal the missing attribute explicitly.
            raise AttributeError(key)
        return self.__getitem__(key)

    def __setattr__(self, key, value):
        """Store attribute assignments as dict entries."""
        return self.__setitem__(key, value)

    def __getitem__(self, k: str) -> Vocab:
        return super().__getitem__(k)

    def __setitem__(self, k: str, v: Vocab) -> None:
        super().__setitem__(k, v)

    def summary(self, logger: logging.Logger = None):
        """Log a summary of vocabs using a given logger.

        Args:
            logger: The logger to use. When falsy, the summary is printed instead.
        """
        for key, value in self.items():
            if isinstance(value, Vocab):
                report = value.summary(verbose=False)
                if logger:
                    logger.info(f'{key}{report}')
                else:
                    print(f'{key}{report}')

    def put(self, **kwargs):
        """Put names and corresponding :class:`hanlp.common.vocab.Vocab` instances into self.

        Args:
            **kwargs: Names and corresponding :class:`hanlp.common.vocab.Vocab` instances.
        """
        for k, v in kwargs.items():
            self[k] = v


class NamedTransform(ABC):
    """Base class for transforms that read a ``src`` field and write a ``dst`` field of a sample."""

    def __init__(self, src: str, dst: str = None) -> None:
        """
        Args:
            src: Name of the field to read.
            dst: Name of the field to write; the same as ``src`` when ``None``.
        """
        self.dst = src if dst is None else dst
        self.src = src

    @abstractmethod
    def __call__(self, sample: dict) -> dict:
        return sample


class ConfigurableTransform(Configurable, ABC):
    """A transform that can be described by, and rebuilt from, a plain config dict."""

    @property
    def config(self):
        """A dict of ``classpath`` plus every instance attribute whose name does not start with ``_``."""
        settings = {'classpath': classpath_of(self)}
        settings.update((name, value) for name, value in vars(self).items() if not name.startswith('_'))
        return settings

    @classmethod
    def from_config(cls, config: dict):
        """Build a transform from a config dict.

        Args:
            config: Must contain a truthy ``classpath`` entry naming the class to instantiate; every other entry
                is passed to that class as a keyword argument. ``config`` itself is left unmodified.

        Returns:
            An instance of the class named by ``classpath``.
        """
        classpath = config.get('classpath', None)
        assert classpath, f"{config} doesn't contain classpath field"
        target = str_to_type(classpath)
        params = dict(config)
        del params['classpath']
        return target(**params)


class ConfigurableNamedTransform(NamedTransform, ConfigurableTransform, ABC):
    """A :class:`NamedTransform` that is also a :class:`ConfigurableTransform`."""


class EmbeddingNamedTransform(ConfigurableNamedTransform, ABC):
    """A configurable named transform that additionally records an ``output_dim``."""

    def __init__(self, output_dim: int, src: str, dst: str) -> None:
        """
        Args:
            output_dim: Stored on the instance as ``self.output_dim``.
            src: Name of the field to read.
            dst: Name of the field to write.
        """
        super().__init__(src, dst)
        self.output_dim = output_dim


class RenameField(NamedTransform):
    """Move the value of the ``src`` field to the ``dst`` field, removing ``src`` from the sample."""

    def __call__(self, sample: dict):
        moved = sample.pop(self.src)
        sample[self.dst] = moved
        return sample


class CopyField(object):
    """Store the value of the ``src`` field under ``dst`` as well; the value itself is not copied."""

    def __init__(self, src, dst) -> None:
        self.dst = dst
        self.src = src

    def __call__(self, sample: dict) -> dict:
        value = sample[self.src]
        sample[self.dst] = value
        return sample


class FilterField(object):
    """Keep only the given keys of a sample, returning a new dict."""

    def __init__(self, *keys) -> None:
        self.keys = keys

    def __call__(self, sample: dict):
        return {key: sample[key] for key in self.keys}


class TransformList(list):
    """A list of transforms that is itself a transform, applying its members in order.

    Args:
      *transforms: The transforms (callables taking and returning a sample) to compose.

    Examples:
        >>> pipeline = TransformList(LowerCase('token'), NormalizeDigit('token'))
        >>> pipeline({'token': ['A1']})
        {'token': ['a0']}
    """

    def __init__(self, *transforms) -> None:
        super().__init__()
        self.extend(transforms)

    def __call__(self, sample):
        """Feed ``sample`` through every transform in order and return the final result."""
        result = sample
        for transform in self:
            result = transform(result)
        return result

    def index_by_type(self, t):
        """Return the index of the first member that is an instance of ``t``, or ``None`` if there is none."""
        return next((position for position, member in enumerate(self) if isinstance(member, t)), None)


class LowerCase(object):
    """Lowercase a field holding either a string or a list of strings; other types are left untouched."""

    def __init__(self, src, dst=None) -> None:
        """
        Args:
            src: Name of the field to read.
            dst: Name of the field to write; the same as ``src`` when ``None``.
        """
        self.src = src
        self.dst = src if dst is None else dst

    def __call__(self, sample: dict) -> dict:
        value = sample[self.src]
        if isinstance(value, list):
            sample[self.dst] = [item.lower() for item in value]
        elif isinstance(value, str):
            sample[self.dst] = value.lower()
        return sample


class LowerCase3D(LowerCase):
    """Lowercase a field holding a list of lists of strings."""

    def __call__(self, sample: dict) -> dict:
        nested = sample[self.src]
        sample[self.dst] = [[item.lower() for item in row] for row in nested]
        return sample


class ToChar(object):
    """Split a string, or each string of a list, into characters with optional padding and truncation."""

    def __init__(self, src, dst='char', max_word_length=None, min_word_length=None, pad=PAD) -> None:
        """
        Args:
            src: Name of the field to read.
            dst: Name of the field to write; the same as ``src`` when ``None``.
            max_word_length: When truthy, character lists are cut to at most this many items.
            min_word_length: When truthy, shorter character lists are padded up to this length.
            pad: The item used for padding.
        """
        self.src = src
        self.dst = src if dst is None else dst
        self.max_word_length = max_word_length
        self.min_word_length = min_word_length
        self.pad = pad

    def __call__(self, sample: dict) -> dict:
        value = sample[self.src]
        if isinstance(value, list):
            sample[self.dst] = [self.to_chars(word) for word in value]
        elif isinstance(value, str):
            sample[self.dst] = self.to_chars(value)
        return sample

    def to_chars(self, word: str):
        """Return the characters of ``word``, padded first and truncated afterwards."""
        lower, upper = self.min_word_length, self.max_word_length
        chars = list(word)
        if lower and len(chars) < lower:
            chars = chars + [self.pad] * (lower - len(chars))
        return chars[:upper] if upper else chars


class AppendEOS(NamedTransform):
    """Write ``sample[src] + [eos]`` into ``dst``, leaving the original list object unmodified."""

    def __init__(self, src: str, dst: str = None, eos=EOS) -> None:
        """
        Args:
            src: Name of the field to read.
            dst: Name of the field to write; the same as ``src`` when ``None``.
            eos: The end-of-sequence item to append.
        """
        super().__init__(src, dst)
        self.eos = eos

    def __call__(self, sample: dict) -> dict:
        tokens = sample[self.src]
        sample[self.dst] = tokens + [self.eos]
        return sample


class WhitespaceTokenizer(NamedTransform):
    """Split a string, or each string of a list, on whitespace; other types are left untouched."""

    def __call__(self, sample: dict) -> dict:
        value = sample[self.src]
        if isinstance(value, list):
            sample[self.dst] = [self.tokenize(text) for text in value]
        elif isinstance(value, str):
            sample[self.dst] = self.tokenize(value)
        return sample

    @staticmethod
    def tokenize(text: str):
        """Return ``text.split()``."""
        tokens = text.split()
        return tokens


class NormalizeDigit(object):
    """Replace every digit character with ``'0'`` in a string or in each string of a list."""

    def __init__(self, src, dst=None) -> None:
        """
        Args:
            src: Name of the field to read.
            dst: Name of the field to write; the same as ``src`` when ``None``.
        """
        self.src = src
        self.dst = src if dst is None else dst

    @staticmethod
    def transform(word: str):
        """Return ``word`` with each character for which ``str.isdigit`` holds replaced by ``'0'``."""
        return ''.join('0' if char.isdigit() else char for char in word)

    def __call__(self, sample: dict) -> dict:
        value = sample[self.src]
        if isinstance(value, list):
            sample[self.dst] = [self.transform(word) for word in value]
        elif isinstance(value, str):
            sample[self.dst] = self.transform(value)
        return sample


class Bigram(NamedTransform):
    """Build, for each item of a list, the concatenation of it with its successor (``EOS`` after the last)."""

    def __init__(self, src: str, dst: str = None) -> None:
        """
        Args:
            src: Name of the field to read.
            dst: Name of the field to write; ``f'{src}_bigram'`` when falsy.
        """
        super().__init__(src, dst or f'{src}_bigram')

    def __call__(self, sample: dict) -> dict:
        tokens: List = sample[self.src]
        padded = tokens + [EOS]
        sample[self.dst] = [left + right for left, right in zip(padded, padded[1:])]
        return sample


class FieldLength(NamedTransform):
    """Store ``len(sample[src]) + delta`` under ``dst``."""

    def __init__(self, src: str, dst: str = None, delta=0) -> None:
        """
        Args:
            src: Name of the field to measure.
            dst: Name of the field to write; ``f'{src}_length'`` when falsy.
            delta: Constant added to the measured length.
        """
        self.delta = delta
        super().__init__(src, dst or f'{src}_length')

    def __call__(self, sample: dict) -> dict:
        length = len(sample[self.src])
        sample[self.dst] = length + self.delta
        return sample


class BMESOtoIOBES(object):
    """Rewrite the tags of a field from the BMESO scheme to IOBES by turning each ``M-`` prefix into ``I-``."""

    def __init__(self, field='tag') -> None:
        """
        Args:
            field: Name of the sample field holding the list of tags.
        """
        self.field = field

    def __call__(self, sample: dict) -> dict:
        """Convert every tag in ``sample[self.field]``; the field is replaced by a new list."""
        sample[self.field] = [self.convert(y) for y in sample[self.field]]
        return sample

    @staticmethod
    def convert(y: str):
        """Convert a single tag.

        Args:
            y: A tag such as ``M-PER``.

        Returns:
            ``y`` with a leading ``M-`` replaced by ``I-`` and the label kept (``M-PER`` -> ``I-PER``); any
            other tag is returned unchanged.
        """
        if y.startswith('M-'):
            # Keep the label after the prefix; returning a bare 'I-' would erase the entity type.
            return 'I-' + y[2:]
        return y


class NormalizeToken(ConfigurableNamedTransform):
    """Replace tokens according to a lookup table; tokens missing from the table are kept as they are."""

    def __init__(self, mapper: Union[str, dict], src: str, dst: str = None) -> None:
        """
        Args:
            mapper: Either the table itself as a dict, or a string that is resolved with ``get_resource`` and
                then loaded with ``load_json``. The argument is kept as given in ``self.mapper``.
            src: Name of the field to read.
            dst: Name of the field to write; the same as ``src`` when ``None``.

        Raises:
            ValueError: If the resolved mapper is neither a ``str`` nor a ``dict``.
        """
        super().__init__(src, dst)
        self.mapper = mapper
        resolved = get_resource(mapper) if isinstance(mapper, str) else mapper
        if isinstance(resolved, str):
            self._table = load_json(resolved)
        elif isinstance(resolved, dict):
            self._table = resolved
        else:
            raise ValueError(f'Unrecognized mapper type {resolved}')

    def __call__(self, sample: dict) -> dict:
        original = sample[self.src]
        if self.src == self.dst:
            # The field is about to be overwritten, so keep the original under the name with a trailing '_'.
            sample[f'{self.src}_'] = original
        if isinstance(original, str):
            normalized = self.convert(original)
        else:
            normalized = [self.convert(token) for token in original]
        sample[self.dst] = normalized
        return sample

    def convert(self, token) -> str:
        """Return the table entry for ``token``, or ``token`` itself when there is none."""
        return self._table.get(token, token)


class PunctuationMask(ConfigurableNamedTransform):
    def __init__(self, src: str, dst: str = None) -> None:
        """Mask out all punctuations (the mask is ``False`` wherever ``ispunct`` holds).

        Args:
          src: Name of the field to read.
          dst: Name of the field to write; ``f'{src}_punct_mask'`` when falsy.
        """
        super().__init__(src, dst or f'{src}_punct_mask')

    def __call__(self, sample: dict) -> dict:
        value = sample[self.src]
        if isinstance(value, str):
            mask = not ispunct(value)
        else:
            mask = [not ispunct(token) for token in value]
        sample[self.dst] = mask
        return sample


class NormalizeCharacter(NormalizeToken):
    """Apply the :class:`NormalizeToken` lookup to every character of a token instead of the whole token."""

    def convert(self, token) -> str:
        lookup = NormalizeToken.convert
        return ''.join(lookup(self, char) for char in token)


================================================
FILE: hanlp/common/transform_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-27 14:22
import inspect
from abc import ABC, abstractmethod
from typing import Generator, Tuple, Union, Iterable, Any

import tensorflow as tf

from hanlp_common.structure import SerializableDict
from hanlp.common.vocab_tf import VocabTF
from hanlp.utils.io_util import get_resource
from hanlp.utils.log_util import logger


class Transform(ABC):
    """Abstract base of TensorFlow transforms which turn files or raw inputs into ``tf.data.Dataset`` objects.

    Subclasses implement :meth:`fit`, :meth:`create_types_shapes_values`, :meth:`file_to_inputs`,
    :meth:`x_to_idx` and :meth:`y_to_idx`.
    """

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, **kwargs) -> None:
        """
        Parameters
        ----------
        config : SerializableDict
            Configuration of this transform. Extra ``kwargs`` are written into it; a new ``SerializableDict``
            is created for them when ``config`` is falsy.
        map_x : bool
            Default for whether :meth:`samples_to_dataset` applies :meth:`x_to_idx`.
        map_y : bool
            Default for whether :meth:`samples_to_dataset` applies :meth:`y_to_idx`.
        kwargs
            Extra configuration entries.
        """
        super().__init__()
        self.map_y = map_y
        self.map_x = map_x
        if kwargs:
            if not config:
                config = SerializableDict()
            for k, v in kwargs.items():
                config[k] = v
        self.config = config
        self.output_types = None
        self.output_shapes = None
        self.padding_values = None
        # Fix tf memory leak: https://github.com/tensorflow/tensorflow/issues/37653#issuecomment-1000517720
        self.py_func_set_to_cleanup = set()

    @abstractmethod
    def fit(self, trn_path: str, **kwargs) -> int:
        """
        Build the vocabulary from training file

        Parameters
        ----------
        trn_path : path to training set
        kwargs

        Returns
        -------
        int
            How many samples in the training set
        """
        raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))

    def build_config(self):
        """
        By default, call create_types_shapes_values, usually called in component's build method.
        You can perform other building task here. Remember to call super().build_config
        """
        self.output_types, self.output_shapes, self.padding_values = self.create_types_shapes_values()

    @abstractmethod
    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """
        Create dataset related values.

        Returns
        -------
        Tuple[Tuple, Tuple, Tuple]
            ``(output_types, output_shapes, padding_values)`` which are stored on this transform by
            :meth:`build_config`.
        """
        raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))

    @abstractmethod
    def file_to_inputs(self, filepath: str, gold=True):
        """
        Transform file to inputs. The inputs are defined as raw features (e.g. words) to be processed into more
        features (e.g. forms and characters)

        Parameters
        ----------
        filepath
        gold
        """
        raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))

    def inputs_to_samples(self, inputs, gold=False):
        """
        Transform inputs to samples.

        Parameters
        ----------
        inputs
            An iterable of inputs.
        gold : bool
            When ``True`` the inputs are yielded as they are. Otherwise each input ``x`` is yielded as
            ``(x, self.padding_values[-1])``, i.e. the last padding value stands in for the missing ``Y``.
        """
        if gold:
            yield from inputs
        else:
            for x in inputs:
                yield x, self.padding_values[-1]

    def file_to_samples(self, filepath: str, gold=True):
        """
        Transform file to samples

        Parameters
        ----------
        filepath
            Passed through ``get_resource`` before being handed to :meth:`file_to_inputs`.
        gold
        """
        filepath = get_resource(filepath)
        inputs = self.file_to_inputs(filepath, gold)
        yield from self.inputs_to_samples(inputs, gold)

    def file_to_dataset(self, filepath: str, gold=True, map_x=None, map_y=None, batch_size=32, shuffle=None,
                        repeat=None,
                        drop_remainder=False,
                        prefetch=1,
                        cache=True,
                        **kwargs) -> tf.data.Dataset:
        """
        Transform file to dataset

        Parameters
        ----------
        filepath
        gold : bool
            Whether it's processing gold data or not. Example: there is usually a column for gold answer
            when gold = True.
        map_x : bool
            Whether call map_x or not. Default to self.map_x
        map_y : bool
            Whether call map_y or not. Default to self.map_y
        batch_size
        shuffle
        repeat
        drop_remainder
        prefetch
        cache
        kwargs
            Accepted for compatibility; not used by this implementation.

        Returns
        -------
        tf.data.Dataset
            The dataset built by :meth:`samples_to_dataset`.
        """

        def generator():
            # Re-read the file on every call so that the dataset can be iterated more than once.
            inputs = self.file_to_inputs(filepath, gold)
            samples = self.inputs_to_samples(inputs, gold)
            yield from samples

        return self.samples_to_dataset(generator, map_x, map_y, batch_size, shuffle, repeat, drop_remainder, prefetch,
                                       cache)

    def inputs_to_dataset(self, inputs, gold=False, map_x=None, map_y=None, batch_size=32, shuffle=None, repeat=None,
                          drop_remainder=False,
                          prefetch=1, cache=False, **kwargs) -> tf.data.Dataset:
        """
        Transform inputs to dataset. Parameters have the same meaning as in :meth:`file_to_dataset` except that
        ``inputs`` replaces ``filepath``; ``gold`` and ``cache`` default to ``False`` here.
        """

        def generator():
            samples = self.inputs_to_samples(inputs, gold)
            yield from samples

        return self.samples_to_dataset(generator, map_x, map_y, batch_size, shuffle, repeat, drop_remainder, prefetch,
                                       cache)

    def samples_to_dataset(self, samples: Generator, map_x=None, map_y=None, batch_size=32, shuffle=None, repeat=None,
                           drop_remainder=False,
                           prefetch=1, cache=True) -> tf.data.Dataset:
        """
        Transform samples to dataset

        Parameters
        ----------
        samples
            A callable returning a generator of samples, as required by ``tf.data.Dataset.from_generator``.
        map_x : bool
            Whether to apply :meth:`x_to_idx`. Default to self.map_x
        map_y : bool
            Whether to apply :meth:`y_to_idx`. Default to self.map_y
        batch_size
            Falsy to skip padded batching.
        shuffle
            Shuffle buffer size; ``True`` means a buffer of 1024. Falsy to skip shuffling.
        repeat
            Passed to ``Dataset.repeat``. Falsy to skip repeating.
        drop_remainder
        prefetch
            Passed to ``Dataset.prefetch``. Falsy to skip prefetching.
        cache
            Falsy to disable caching. A ``str`` is passed to ``Dataset.cache`` as filename, any other truthy
            value results in ``Dataset.cache('')``.

        Returns
        -------
        tf.data.Dataset
        """
        output_types, output_shapes, padding_values = self.output_types, self.output_shapes, self.padding_values
        if not all(v for v in [output_types, output_shapes, padding_values]):
            # The transform has not been configured yet, so build its config lazily.
            self.build_config()
            output_types, output_shapes, padding_values = self.output_types, self.output_shapes, self.padding_values
        assert all(v for v in [output_types, output_shapes,
                               padding_values]), 'Your create_types_shapes_values returns None, which is not allowed'
        if not hasattr(tf.compat.v1.get_default_graph(), '_py_funcs_used_in_graph'):
            tf.compat.v1.get_default_graph()._py_funcs_used_in_graph = []
        # Snapshot the registered py funcs so that the ones created below can be released in cleanup().
        py_func_set_before = set(tf.compat.v1.get_default_graph()._py_funcs_used_in_graph)
        dataset = tf.data.Dataset.from_generator(samples, output_types=output_types, output_shapes=output_shapes)
        if cache:
            logger.debug('Dataset cache enabled')
            dataset = dataset.cache(cache if isinstance(cache, str) else '')
        if shuffle:
            if isinstance(shuffle, bool):
                shuffle = 1024
            dataset = dataset.shuffle(shuffle)
        if repeat:
            dataset = dataset.repeat(repeat)
        if batch_size:
            dataset = dataset.padded_batch(batch_size, output_shapes, padding_values, drop_remainder)
        if prefetch:
            dataset = dataset.prefetch(prefetch)
        if map_x is None:
            map_x = self.map_x
        if map_y is None:
            map_y = self.map_y
        if map_x or map_y:
            def mapper(X, Y):
                if map_x:
                    X = self.x_to_idx(X)
                if map_y:
                    Y = self.y_to_idx(Y)
                return X, Y

            dataset = dataset.map(mapper, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        py_func_set_after = set(tf.compat.v1.get_default_graph()._py_funcs_used_in_graph) - py_func_set_before
        self.py_func_set_to_cleanup |= py_func_set_after
        return dataset

    @abstractmethod
    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        """Map ``X`` of a batch to its indexed form. Applied by :meth:`samples_to_dataset` when ``map_x``."""
        raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))

    @abstractmethod
    def y_to_idx(self, y) -> tf.Tensor:
        """Map ``Y`` of a batch to its indexed form. Applied by :meth:`samples_to_dataset` when ``map_y``."""
        raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))

    def lock_vocabs(self):
        """Call ``lock()`` on every attribute of this transform which is a ``VocabTF``."""
        for key, value in vars(self).items():
            if isinstance(value, VocabTF):
                value.lock()

    def summarize_vocabs(self, logger=None, header='Vocab summary:'):
        """
        Report a summary of every ``VocabTF`` attribute, largest vocab first.

        Parameters
        ----------
        logger
            The summary is sent to ``logger.info`` when given, otherwise it is printed.
        header
            The first line of the summary.
        """
        output = header + '\n'
        vocabs = {}
        for key, value in vars(self).items():
            if isinstance(value, VocabTF):
                vocabs[key] = value
        # tag vocab comes last usually
        for key, value in sorted(vocabs.items(), key=lambda kv: len(kv[1]), reverse=True):
            output += f'{key}' + value.summary(verbose=False) + '\n'
        output = output.strip()
        if logger:
            logger.info(output)
        else:
            print(output)

    @staticmethod
    def generator_to_callable(generator: Generator):
        """Wrap ``generator`` into a zero-argument callable which returns a generator over its items."""
        return lambda: (x for x in generator)

    def str_to_idx(self, X, Y) -> Tuple[Union[tf.Tensor, Tuple], tf.Tensor]:
        """Apply :meth:`x_to_idx` to ``X`` and :meth:`y_to_idx` to ``Y``."""
        return self.x_to_idx(X), self.y_to_idx(Y)

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        """Convert ``X`` to inputs. The default implementation returns the ``repr`` of each item."""
        return [repr(x) for x in X]

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
                     batch=None) -> Iterable:
        """Convert ``Y`` to outputs. The default implementation returns the ``repr`` of each item and ignores
        the other arguments."""
        return [repr(y) for y in Y]

    def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]],
                             Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False) -> Iterable:
        """
        Convert predicted tensors to outputs

        Parameters
        ----------
        X : Union[tf.Tensor, Tuple[tf.Tensor]]
            The inputs of model
        Y : Union[tf.Tensor, Tuple[tf.Tensor]]
            The outputs of model
        gold : bool
            Forwarded to :meth:`Y_to_outputs`.

        Returns
        -------
        list
            Pairs of ``(input, output)`` zipped from :meth:`X_to_inputs` and :meth:`Y_to_outputs`.
        """
        return [(x, y) for x, y in zip(self.X_to_inputs(X), self.Y_to_outputs(Y, gold))]

    def input_is_single_sample(self, input: Any) -> bool:
        """Tell whether ``input`` is one sample rather than a collection. The default is always ``False``."""
        return False

    def input_to_inputs(self, input: Any) -> Tuple[Any, bool]:
        """
        If input is one sample, convert it to a list which contains this unique sample

        Parameters
        ----------
        input :
            sample or samples

        Returns
        -------
        (inputs, converted) : Tuple[Any, bool]

        """
        flat = self.input_is_single_sample(input)
        if flat:
            input = [input]
        return input, flat

    def input_truth_output_to_str(self, input, truth, output):
        """
        Convert input truth output to string representation, usually for writing to file during evaluation

        Parameters
        ----------
        input
        truth
        output

        Returns
        -------
        str
            The three values joined by tabs and terminated by a newline.
        """
        return '\t'.join([input, truth, output]) + '\n'

    def cleanup(self):
        """
        Unregister the py funcs recorded by :meth:`samples_to_dataset` from the default graph so that they can be
        garbage collected. It is a no-op on the graph if no py func list has ever been attached to it.
        """
        graph = tf.compat.v1.get_default_graph()
        registered = getattr(graph, '_py_funcs_used_in_graph', None)
        if registered:
            # Keep the remaining funcs in their original order instead of round-tripping through a set.
            graph._py_funcs_used_in_graph = [f for f in registered if f not in self.py_func_set_to_cleanup]
        self.py_func_set_to_cleanup = set()


================================================
FILE: hanlp/common/vocab.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-06-13 22:42
from collections import Counter
from typing import List, Dict, Union, Iterable

from hanlp_common.constant import UNK, PAD
from hanlp_common.structure import Serializable
from hanlp_common.reflection import classpath_of


class Vocab(Serializable):
    def __init__(self, idx_to_token: List[str] = None, token_to_idx: Dict = None, mutable=True, pad_token=PAD,
                 unk_token=UNK) -> None:
        """Vocabulary base class which converts tokens to indices and vice versa.

        Args:
            idx_to_token: id to token mapping.
            token_to_idx: token to id mapping.
            mutable: ``True`` to allow adding new tokens, ``False`` to map OOV to ``unk``.
            pad_token: The token representing padding.
            unk_token: The token representing OOV.
        """
        super().__init__()
        if idx_to_token:
            t2i = dict((token, idx) for idx, token in enumerate(idx_to_token))
            if token_to_idx:
                t2i.update(token_to_idx)
            token_to_idx = t2i
        if token_to_idx is None:
            # A fresh vocab starts with the special tokens: pad first, then unk.
            token_to_idx = {}
            if pad_token is not None:
                token_to_idx[pad_token] = len(token_to_idx)
            if unk_token is not None:
                # ``get`` keeps the existing index when pad and unk are the same token.
                token_to_idx[unk_token] = token_to_idx.get(unk_token, len(token_to_idx))
        self.token_to_idx = token_to_idx
        # The reverse mapping is only built by ``lock`` (or ``build_idx_to_token``).
        self.idx_to_token: List[str] = None
        self.mutable = mutable
        self.pad_token = pad_token
        self.unk_token = unk_token

    def __setitem__(self, token: str, idx: int):
        """Assign ``idx`` to ``token``. The vocab must be mutable."""
        assert self.mutable, 'Update an immutable Vocab object is not allowed'
        self.token_to_idx[token] = idx

    def __getitem__(self, key: Union[str, int, List]) -> Union[int, str, List]:
        """ Get the index/indices associated with a token or a list of tokens or vice versa.

        Args:
            key: ``str`` for token(s) and ``int`` for index/indices.

        Returns: Associated indices or tokens. ``None`` is returned for any other type of key.

        """
        if isinstance(key, str):
            return self.get_idx(key)
        elif isinstance(key, int):
            return self.get_token(key)
        elif isinstance(key, list):
            if len(key) == 0:
                return []
            elif isinstance(key[0], str):
                return [self.get_idx(x) for x in key]
            elif isinstance(key[0], int):
                return [self.get_token(x) for x in key]

    def __contains__(self, key: Union[str, int]):
        """Test whether a token (``str``) or an index (``int``) is in this vocab.

        Args:
            key: A token or an index.

        Returns:
            ``True`` if it is in this vocab. Keys of any other type are never in the vocab.
        """
        if isinstance(key, str):
            return key in self.token_to_idx
        elif isinstance(key, int):
            if self.idx_to_token is not None:
                return 0 <= key < len(self.idx_to_token)
            # An unlocked vocab has no ``idx_to_token`` list yet, so consult the indices in use instead.
            return key in self.token_to_idx.values()
        else:
            return False

    def add(self, token: str) -> int:
        """ Tries to add a token into a vocab and returns its id. If it has already been there, its id will be returned
        and the vocab won't be updated. If the vocab is locked, an assertion failure will occur.

        Args:
            token: A new or existing token.

        Returns:
            Its associated id.

        """
        assert self.mutable, 'It is not allowed to call add on an immutable Vocab'
        # This also rejects ``None``.
        assert isinstance(token, str), f'Token type must be str but got {type(token)} from {token}'
        idx = self.token_to_idx.get(token, None)
        if idx is None:
            idx = len(self.token_to_idx)
            self.token_to_idx[token] = idx
        return idx

    def update(self, tokens: Iterable[str]) -> None:
        """Update the vocab with these tokens by adding them to vocab one by one.

        Args:
          tokens (Iterable[str]): A list of tokens.
        """
        assert self.mutable, 'It is not allowed to update an immutable Vocab'
        for token in tokens:
            self.add(token)

    def get_idx(self, token: str) -> int:
        """Get the idx of a token. If it's not there, it will be added to the vocab when the vocab is mutable
        (unlocked), otherwise the id of UNK will be returned.

        Args:
            token: A token.

        Returns:
            The id of that token. It is ``None`` if the vocab is locked and has no ``unk_token`` entry.

        """
        assert isinstance(token, str), 'token has to be `str`'
        idx = self.token_to_idx.get(token, None)
        if idx is None:
            if self.mutable:
                idx = len(self.token_to_idx)
                self.token_to_idx[token] = idx
            else:
                idx = self.token_to_idx.get(self.unk_token, None)
        return idx

    def get_idx_without_add(self, token: str) -> int:
        """Get the idx of a token without ever adding it to the vocab.

        Args:
            token: A token.

        Returns:
            The id of that token, or the id of :attr:`safe_unk_token` (possibly ``None``) if it's not there.
        """
        idx = self.token_to_idx.get(token, None)
        if idx is None:
            idx = self.token_to_idx.get(self.safe_unk_token, None)
        return idx

    def get_token(self, idx: int) -> str:
        """Get the token using its index.

        Args:
            idx: The index to a token.

        Returns:
            The token, or ``None`` if ``idx_to_token`` has not been built and no token is mapped to ``idx``.

        """
        if self.idx_to_token:
            return self.idx_to_token[idx]

        # ``idx_to_token`` is only built by ``lock``; fall back to a linear scan, which also covers vocabs
        # constructed as immutable without ever being locked.
        for token, token_idx in self.token_to_idx.items():
            if token_idx == idx:
                return token
        return None

    def has_key(self, token):
        """Test whether ``token`` is in this vocab."""
        return token in self.token_to_idx

    def __len__(self):
        return len(self.token_to_idx)

    def lock(self):
        """Lock this vocab up so that it won't accept new tokens.

        Returns:
            Itself.

        """
        if self.locked:
            return self
        self.mutable = False
        self.build_idx_to_token()
        return self

    def build_idx_to_token(self):
        """Build the ``idx_to_token`` list from ``token_to_idx``. Indices without a token are left as ``None`` and
        an empty vocab results in an empty list.
        """
        # ``default`` keeps an empty vocab from raising ``ValueError``.
        max_idx = max(self.token_to_idx.values(), default=-1)
        self.idx_to_token = [None] * (max_idx + 1)
        for token, idx in self.token_to_idx.items():
            self.idx_to_token[idx] = token

    def unlock(self):
        """Unlock this vocab so that new tokens can be added in.

        Returns:
            Itself.

        """
        if not self.locked:
            return self
        self.mutable = True
        self.idx_to_token = None
        return self

    @property
    def locked(self):
        """
        ``True`` indicates this vocab is locked.
        """
        return not self.mutable

    @property
    def unk_idx(self):
        """
        The index of ``UNK`` token.
        """
        if self.unk_token is None:
            return None
        else:
            return self.token_to_idx.get(self.unk_token, None)

    @property
    def pad_idx(self):
        """
        The index of ``PAD`` token.
        """
        if self.pad_token is None:
            return None
        else:
            return self.token_to_idx.get(self.pad_token, None)

    @property
    def tokens(self):
        """
        A view of all tokens in this vocab (the keys of ``token_to_idx``).
        """
        return self.token_to_idx.keys()

    def __str__(self) -> str:
        return self.token_to_idx.__str__()

    def summary(self, verbose=True) -> str:
        """Get or print a summary of this vocab.

        Args:
            verbose: ``True`` to print the summary to stdout.

        Returns:
            Summary in text form, which is the size of this vocab followed by at most 50 of its tokens.

        """
        report = '[{}] = '.format(len(self))
        report += str(list(self.token_to_idx.keys())[:min(50, len(self))])
        if verbose:
            print(report)
        return report

    def __call__(self, some_token: Union[str, Iterable[str]]) -> Union[int, List[int]]:
        """Convert a token, a collection of tokens or a collection of collections of tokens to indices.

        Args:
            some_token: A token, or a ``list``/``tuple``/``set`` of tokens, or a ``list``/``tuple``/``set`` of
                such collections. Whether the input is nested is decided by its first element.

        Returns:
            Indices in the same structure, with every collection converted to a ``list``.
        """
        if isinstance(some_token, (list, tuple, set)):
            # A ``set`` is not subscriptable, so peek at the first element through an iterator.
            first = next(iter(some_token), None)
            if isinstance(first, (list, tuple, set)):
                return [[self.get_idx(token) for token in sent] for sent in some_token]
            return [self.get_idx(token) for token in some_token]
        else:
            return self.get_idx(some_token)

    def to_dict(self) -> dict:
        """Convert this vocab to a dict so that it can be json serialized.

        Returns:
            A dict with keys ``idx_to_token``, ``pad_token``, ``unk_token`` and ``mutable``. Note that
            ``idx_to_token`` is ``None`` unless it has been built, e.g. by :meth:`lock`.

        """
        return {
            'idx_to_token': self.idx_to_token,
            'pad_token': self.pad_token,
            'unk_token': self.unk_token,
            'mutable': self.mutable,
        }

    def copy_from(self, item: dict):
        """Copy properties from a dict so that it can json de-serialized.

        Args:
            item: A dict whose entries are set as attributes of this vocab. ``token_to_idx`` is then rebuilt
                from ``idx_to_token``.

        Returns:
            Itself.

        """
        for key, value in item.items():
            setattr(self, key, value)
        self.token_to_idx = {k: v for v, k in enumerate(self.idx_to_token)}
        return self

    def lower(self):
        """Convert all tokens to lower case. The vocab is unlocked and re-indexed in the process.

        Returns:
            Itself.

        """
        self.unlock()
        token_to_idx = self.token_to_idx
        self.token_to_idx = {}
        for token in token_to_idx.keys():
            self.add(token.lower())
        return self

    @property
    def first_token(self):
        """The first token in this vocab.
        """
        if self.idx_to_token:
            return self.idx_to_token[0]
        if self.token_to_idx:
            return next(iter(self.token_to_idx))
        return None

    def merge(self, other):
        """Merge this with another vocab inplace.

        Args:
            other (Vocab): Another vocab.
        """
        for word, idx in other.token_to_idx.items():
            self.get_idx(word)

    @property
    def safe_pad_token(self) -> str:
        """Get the pad token safely. It always returns a pad token, which is the pad token or the first token
        if pad does not present in the vocab.
        """
        if self.pad_token:
            return self.pad_token
        if self.first_token:
            return self.first_token
        return PAD

    @property
    def safe_pad_token_idx(self) -> int:
        """Get the idx to the pad token safely. It always returns an index, which corresponds to the pad token or the
        first token if pad does not present in the vocab.
        """
        return self.token_to_idx.get(self.safe_pad_token, 0)

    @property
    def safe_unk_token(self) -> str:
        """Get the unk token safely. It always returns a unk token, which is the unk token or the first token if unk
        does not presented in the vocab.
        """
        if self.unk_token:
            return self.unk_token
        if self.first_token:
            return self.first_token
        return UNK

    def __repr__(self) -> str:
        if self.idx_to_token is not None:
            return self.idx_to_token.__repr__()
        return self.token_to_idx.__repr__()

    def extend(self, tokens: Iterable[str]):
        """Unlock this vocab and convert ``tokens`` through :meth:`__call__`, which adds the unseen ones."""
        self.unlock()
        self(tokens)

    def reload_idx_to_token(self, idx_to_token: List[str], pad_idx=0, unk_idx=1):
        """Replace the content of this vocab with ``idx_to_token``.

        Args:
            idx_to_token: The new id to token mapping.
            pad_idx: The index of the new pad token in ``idx_to_token``. ``None`` to keep the current pad token.
            unk_idx: The index of the new unk token in ``idx_to_token``. ``None`` to keep the current unk token.
        """
        self.idx_to_token = idx_to_token
        self.token_to_idx = dict((s, i) for i, s in enumerate(idx_to_token))
        if pad_idx is not None:
            self.pad_token = idx_to_token[pad_idx]
        if unk_idx is not None:
            self.unk_token = idx_to_token[unk_idx]

    def set_unk_as_safe_unk(self):
        """Set ``self.unk_token = self.safe_unk_token``. It's useful when the dev/test set contains OOV labels.
        """
        self.unk_token = self.safe_unk_token

    def clear(self):
        """Unlock this vocab and remove all of its tokens, including the special ones."""
        self.unlock()
        self.token_to_idx.clear()


class CustomVocab(Vocab):
    def to_dict(self) -> dict:
        """Serialize this vocab as the base class does, and store ``classpath_of(self)`` under the key ``type``.

        Returns:
            A dict.
        """
        return {**super().to_dict(), 'type': classpath_of(self)}


class LowercaseVocab(CustomVocab):
    def get_idx(self, token: str) -> int:
        """Get the idx of a token, retrying with its lower-cased form before treating it as unseen.

        Args:
            token: A token.

        Returns:
            The id of the token or of its lower-cased form. An unseen token is added as it is (not lower-cased)
            when the vocab is mutable, otherwise the id of ``unk_token`` is returned.
        """
        table = self.token_to_idx
        found = table.get(token)
        if found is not None:
            return found
        found = table.get(token.lower())
        if found is not None:
            return found
        if not self.mutable:
            return table.get(self.unk_token)
        new_idx = len(table)
        table[token] = new_idx
        return new_idx


class VocabWithNone(CustomVocab):
    def get_idx(self, token: str) -> int:
        """Map ``None`` to ``-1`` and defer every other token to the parent lookup."""
        return -1 if token is None else super().get_idx(token)


class VocabWithFrequency(CustomVocab):

    def __init__(self, counter: Counter = None, min_occur_cnt=0, pad_token=PAD, unk_token=UNK, specials=None) -> None:
        """A locked vocab built from a counter, which remembers the frequency of each token.

        Args:
            counter: Token frequencies. Tokens are added in ``most_common`` order. Note that ``specials`` are
                popped from it, i.e. the passed-in counter is modified.
            min_occur_cnt: Tokens occurring less often than this are dropped.
            pad_token: The token representing padding.
            unk_token: The token representing OOV.
            specials: Extra special tokens placed right after pad and unk.
        """
        super().__init__(None, None, True, pad_token, unk_token)
        if specials:
            for each in specials:
                # ``counter`` is optional, so only strip the specials from it when it is given.
                if counter is not None:
                    counter.pop(each, None)
                self.add(each)
        # Special tokens get a placeholder frequency of 1.
        self.frequencies = [1] * len(self)
        if counter:
            for token, freq in counter.most_common():
                if freq >= min_occur_cnt:
                    idx = self.add(token)
                    # Only a newly added token gets a new entry. A token which is already in the vocab
                    # (e.g. pad or unk) keeps its entry so that ``frequencies`` stays aligned with indices.
                    if idx == len(self.frequencies):
                        self.frequencies.append(freq)
        self.lock()

    def to_dict(self) -> dict:
        """Serialize this vocab as the parent does, plus its ``frequencies``.

        Returns:
            A dict.
        """
        d = super().to_dict()
        d['frequencies'] = self.frequencies
        return d

    def copy_from(self, item: dict):
        """Copy properties from a dict, including its ``frequencies``.

        Args:
            item: A dict produced by :meth:`to_dict`.

        Returns:
            Itself.
        """
        super().copy_from(item)
        self.frequencies = item['frequencies']
        return self

    def get_frequency(self, token):
        """Get the frequency stored at the index which ``get_idx`` returns for ``token``.

        Args:
            token: A token.

        Returns:
            The stored frequency, or ``0`` if ``get_idx`` returns ``None``.
        """
        idx = self.get_idx(token)
        if idx is not None:
            return self.frequencies[idx]
        return 0


class VocabCounter(CustomVocab):

    def __init__(self, idx_to_token: List[str] = None, token_to_idx: Dict = None, mutable=True, pad_token=PAD,
                 unk_token=UNK) -> None:
        """A vocab which counts how often each token is looked up while it is mutable.

        Args:
            idx_to_token: id to token mapping.
            token_to_idx: token to id mapping.
            mutable: ``True`` to allow adding new tokens, ``False`` to map OOV to ``unk``.
            pad_token: The token representing padding.
            unk_token: The token representing OOV.
        """
        super().__init__(idx_to_token, token_to_idx, mutable, pad_token, unk_token)
        self.counter = Counter()

    def get_idx(self, token: str) -> int:
        """Get the idx of a token, counting the lookup if this vocab is mutable.

        Args:
            token: A token.

        Returns:
            The id of that token as returned by the parent lookup.
        """
        if self.mutable:
            self.counter[token] += 1
        return super().get_idx(token)

    def trim(self, min_frequency):
        """Drop tokens which were counted less than ``min_frequency`` times and re-index the survivors.

        The special tokens always survive and come first, pad before unk, with ``-1`` as their count. The
        remaining tokens follow in ``most_common`` order.

        Args:
            min_frequency: The minimum count a token needs to survive.
        """
        assert self.mutable
        excluded = {self.unk_token, self.pad_token}
        # Use a fixed order instead of iterating a set, otherwise the indices of pad and unk would depend on
        # the hash order of the current process.
        specials = []
        for special in (self.pad_token, self.unk_token):
            if special and special not in specials:
                specials.append(special)
        survivors = [(token, freq) for token, freq in self.counter.most_common()
                     if freq >= min_frequency and token not in excluded]
        survivors = [(x, -1) for x in specials] + survivors
        self.counter = Counter(dict(survivors))
        self.token_to_idx = dict()
        self.idx_to_token = None
        for token, freq in survivors:
            idx = len(self.token_to_idx)
            self.token_to_idx[token] = idx

    def copy_from(self, item: dict):
        """Copy properties from a dict, restoring the counter if it has been serialized.

        Args:
            item: A dict produced by :meth:`to_dict`.

        Returns:
            Itself.
        """
        super().copy_from(item)
        # Pass the mapping itself: ``Counter(mapping.items())`` would count the ``(token, freq)`` pairs instead
        # of restoring the frequencies.
        self.counter = Counter(item['counter']) if 'counter' in item else Counter()
        return self

    def to_dict(self) -> dict:
        """Serialize this vocab as the parent does, plus its ``counter`` as a plain dict.

        Returns:
            A dict.
        """
        d = super().to_dict()
        d['counter'] = dict(self.counter)
        return d


class Vocab3D(CustomVocab):
    def __call__(self, some_token: Union[str, Iterable[str], Iterable[Iterable[str]]]) \
            -> Union[int, List[int], List[List[int]]]:
        """It supports 3D arrays of tokens.

        Args:
            some_token: Tokens of 1D to 3D. A ``list``, ``tuple`` or ``set`` is converted element by element,
                where nested levels may be mixed freely. Anything else is treated as a single token.

        Returns:
            A list of indices in the same nested structure, or a single index for a single token.

        """
        if isinstance(some_token, (list, tuple, set)):
            return [self._nested_to_idx(item) for item in some_token]
        else:
            return self.get_idx(some_token)

    def _nested_to_idx(self, item):
        """Convert a token to its index, or recursively convert every element of a nested collection."""
        if isinstance(item, str):
            return self.get_idx(item)
        return [self._nested_to_idx(child) for child in item]


def create_label_vocab() -> Vocab:
    """Create an empty ``Vocab`` which has neither a pad token nor an unk token."""
    label_vocab = Vocab(pad_token=None, unk_token=None)
    return label_vocab


================================================
FILE: hanlp/common/vocab_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-06-13 22:42
from typing import List, Dict, Union, Iterable

from hanlp_common.structure import Serializable
from hanlp_common.constant import PAD, UNK
import tensorflow as tf
from tensorflow.python.ops.lookup_ops import index_table_from_tensor


class VocabTF(Serializable):
    """Mapping between tokens and indices with a TensorFlow lookup table for locked vocabs.

    While mutable, the vocab assigns a new index to every unseen token passed to :meth:`get_idx`.
    :meth:`lock` makes it immutable and builds ``idx_to_token`` plus the table used by :meth:`lookup`.
    """

    def __init__(self, idx_to_token: List[str] = None, token_to_idx: Dict = None, mutable=True, pad_token=PAD,
                 unk_token=UNK) -> None:
        """Create a vocab.

        Args:
            idx_to_token: Tokens whose positions are used as their indices.
            token_to_idx: An explicit token to index mapping. Its entries override those derived from
                ``idx_to_token``.
            mutable: Whether new tokens may be added.
            pad_token: The padding token. It is inserted only when no mapping was supplied.
            unk_token: The unknown token. It is inserted only when no mapping was supplied.
        """
        super().__init__()
        if idx_to_token:
            t2i = {token: idx for idx, token in enumerate(idx_to_token)}
            if token_to_idx:
                t2i.update(token_to_idx)
            token_to_idx = t2i
        if token_to_idx is None:
            token_to_idx = {}
            if pad_token:
                token_to_idx[pad_token] = len(token_to_idx)
            if unk_token:
                token_to_idx[unk_token] = len(token_to_idx)
        self.token_to_idx = token_to_idx
        # Only populated by build_idx_to_token(), i.e. once the vocab gets locked.
        self.idx_to_token: list = None
        self.mutable = mutable
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.token_to_idx_table: tf.lookup.StaticHashTable = None
        self.idx_to_token_table = None

    def __setitem__(self, token: str, idx: int):
        """Bind ``token`` to ``idx``; only allowed on a mutable vocab."""
        assert self.mutable, 'Update an immutable Vocab object is not allowed'
        self.token_to_idx[token] = idx

    def __getitem__(self, key: Union[str, int, List]) -> Union[int, str, List]:
        """Look up an index by token, a token by index, or a list of either.

        Args:
            key: A token, an index, or a list whose first element decides how all elements are looked up.

        Returns:
            The index, the token, or a list of them. ``None`` for any other kind of key.
        """
        if isinstance(key, str):
            return self.get_idx(key)
        elif isinstance(key, int):
            return self.get_token(key)
        elif isinstance(key, list):
            if len(key) == 0:
                return []
            elif isinstance(key[0], str):
                return [self.get_idx(x) for x in key]
            elif isinstance(key[0], int):
                return [self.get_token(x) for x in key]

    def __contains__(self, key: Union[str, int]):
        """Tell whether a token (``str``) or an index (``int``) is known to this vocab."""
        if isinstance(key, str):
            return key in self.token_to_idx
        elif isinstance(key, int):
            if self.idx_to_token is None:
                # The index list only exists after lock(); fall back to the mapping instead of
                # failing on ``len(None)``.
                return key in self.token_to_idx.values()
            return 0 <= key < len(self.idx_to_token)
        else:
            return False

    def add(self, token: str) -> int:
        """Add a non-empty ``str`` token to a mutable vocab.

        Args:
            token: The token to add.

        Returns:
            The index of the token, which is the existing one if the token was already present.
        """
        assert self.mutable, 'It is not allowed to call add on an immutable Vocab'
        assert isinstance(token, str), f'Token type must be str but got {type(token)} from {token}'
        assert token, 'Token must not be None or length 0'
        idx = self.token_to_idx.get(token, None)
        if idx is None:
            idx = len(self.token_to_idx)
            self.token_to_idx[token] = idx
        return idx

    def update(self, tokens: Iterable[str]) -> None:
        """Update the vocab with these tokens by adding them to vocab one by one.

        Args:
            tokens: The tokens to add.
        """
        assert self.mutable, 'It is not allowed to update an immutable Vocab'
        for token in tokens:
            self.add(token)

    def get_idx(self, token: str) -> int:
        """Get the index of a token.

        Args:
            token: The token to look up.

        Returns:
            The index of the token. An unseen token is added when the vocab is mutable; otherwise the
            index of ``unk_token`` is returned, which is ``None`` if that token is not in the vocab.
        """
        idx = self.token_to_idx.get(token, None)
        if idx is None:
            if self.mutable:
                idx = len(self.token_to_idx)
                self.token_to_idx[token] = idx
            else:
                idx = self.token_to_idx.get(self.unk_token, None)
        return idx

    def get_idx_without_add(self, token: str) -> int:
        """Get the index of a token without ever adding it.

        Args:
            token: The token to look up.

        Returns:
            The index of the token, or that of :attr:`safe_unk_token` (possibly ``None``) if it is unseen.
        """
        idx = self.token_to_idx.get(token, None)
        if idx is None:
            idx = self.token_to_idx.get(self.safe_unk_token, None)
        return idx

    def get_token(self, idx: int) -> str:
        """Get the token bound to an index.

        Args:
            idx: The index to look up.

        Returns:
            The token. ``None`` if the vocab is mutable and no token has this index, or if the vocab is
            immutable but has no ``idx_to_token`` list.
        """
        if self.idx_to_token:
            return self.idx_to_token[idx]

        if self.mutable:
            # No index list before lock(), so scan the mapping.
            for token, token_idx in self.token_to_idx.items():
                if token_idx == idx:
                    return token
        return None

    def has_key(self, token):
        """Tell whether ``token`` is in the token to index mapping."""
        return token in self.token_to_idx

    def __len__(self):
        return len(self.token_to_idx)

    def lock(self):
        """Make the vocab immutable and build ``idx_to_token`` and the lookup table.

        Returns:
            This vocab.
        """
        if self.locked:
            return self
        self.mutable = False
        self.build_idx_to_token()
        self.build_lookup_table()
        return self

    def build_idx_to_token(self):
        """Build ``idx_to_token`` from ``token_to_idx``; indices without a token are left as ``None``."""
        max_idx = max(self.token_to_idx.values())
        self.idx_to_token = [None] * (max_idx + 1)
        for token, idx in self.token_to_idx.items():
            self.idx_to_token[idx] = token

    def build_lookup_table(self):
        """Build the TensorFlow token to index table from ``idx_to_token``."""
        tensor = tf.constant(self.idx_to_token, dtype=tf.string)
        self.token_to_idx_table = index_table_from_tensor(tensor, num_oov_buckets=1 if self.unk_idx is None else 0,
                                                          default_value=-1 if self.unk_idx is None else self.unk_idx)
        # ``idx_to_token_table`` is not built here and therefore stays ``None``.

    def unlock(self):
        """Make the vocab mutable again and drop the structures built by :meth:`lock`.

        Returns:
            This vocab, whether or not it was locked, which mirrors what :meth:`lock` returns.
        """
        if not self.locked:
            return self
        self.mutable = True
        self.idx_to_token = None
        self.idx_to_token_table = None
        self.token_to_idx_table = None
        return self

    @property
    def locked(self):
        """Whether the vocab is immutable."""
        return not self.mutable

    @property
    def unk_idx(self):
        """The index of ``unk_token``, or ``None`` if there is no such token in the vocab."""
        if self.unk_token is None:
            return None
        else:
            return self.token_to_idx.get(self.unk_token, None)

    @property
    def pad_idx(self):
        """The index of ``pad_token``, or ``None`` if there is no such token in the vocab."""
        if self.pad_token is None:
            return None
        else:
            return self.token_to_idx.get(self.pad_token, None)

    @property
    def tokens(self):
        """A view of all tokens in the vocab."""
        return self.token_to_idx.keys()

    def __str__(self) -> str:
        return self.token_to_idx.__str__()

    def summary(self, verbose=True) -> str:
        """Describe the vocab by its size and up to its first 50 tokens.

        Args:
            verbose: Whether to also print the report.

        Returns:
            The report.
        """
        report = '[{}] = '.format(len(self))
        report += str(list(self.token_to_idx.keys())[:min(50, len(self))])
        if verbose:
            print(report)
        return report

    def __call__(self, some_token: Union[str, List[str]]) -> Union[int, List[int]]:
        """Convert a token or a ``list`` of tokens to indices through :meth:`get_idx`."""
        if isinstance(some_token, list):
            return [self.get_idx(token) for token in some_token]
        else:
            return self.get_idx(some_token)

    def lookup(self, token_tensor: tf.Tensor) -> tf.Tensor:
        """Look up a tensor of tokens in the TensorFlow table, locking the vocab first if needed."""
        if self.mutable:
            self.lock()
        return self.token_to_idx_table.lookup(token_tensor)

    def to_dict(self) -> dict:
        """Serialize the vocab.

        Returns:
            A dict with the keys ``idx_to_token``, ``pad_token``, ``unk_token`` and ``mutable``. Note that
            ``idx_to_token`` is ``None`` unless the vocab has been locked.
        """
        # Spelled out instead of being harvested from ``locals()`` so that introducing a local variable
        # can never leak into the serialized form.
        return {
            'idx_to_token': self.idx_to_token,
            'pad_token': self.pad_token,
            'unk_token': self.unk_token,
            'mutable': self.mutable,
        }

    def copy_from(self, item: dict):
        """Restore the vocab from a dict by setting every entry as an attribute.

        ``token_to_idx`` is rebuilt from ``idx_to_token`` and the lookup table is rebuilt if the restored
        vocab is immutable.

        Args:
            item: The serialized state.
        """
        for key, value in item.items():
            setattr(self, key, value)
        self.token_to_idx = {k: v for v, k in enumerate(self.idx_to_token)}
        if not self.mutable:
            self.build_lookup_table()

    def lower(self):
        """Lowercase all tokens in place, assigning fresh indices in the original iteration order.

        The vocab is unlocked by this call and is not locked again.

        Returns:
            This vocab.
        """
        self.unlock()
        token_to_idx = self.token_to_idx
        self.token_to_idx = {}
        for token in token_to_idx.keys():
            self.add(token.lower())
        return self

    @property
    def first_token(self):
        """The token at index 0 of ``idx_to_token`` if built, else the first key of ``token_to_idx``."""
        if self.idx_to_token:
            return self.idx_to_token[0]
        if self.token_to_idx:
            return next(iter(self.token_to_idx))
        return None

    def merge(self, other):
        """Pass every token of ``other`` through :meth:`get_idx`, which adds it if this vocab is mutable."""
        for word in other.token_to_idx:
            self.get_idx(word)

    @property
    def safe_pad_token(self) -> str:
        """Get the pad token safely. It always returns a pad token, which is the token
        closest to pad if not presented in the vocab.

        Returns:
            ``pad_token`` if set, otherwise :attr:`first_token`, otherwise the ``PAD`` constant.
        """
        if self.pad_token:
            return self.pad_token
        if self.first_token:
            return self.first_token
        return PAD

    @property
    def safe_pad_token_idx(self) -> int:
        """The index of :attr:`safe_pad_token`, or 0 if that token is not in the vocab."""
        return self.token_to_idx.get(self.safe_pad_token, 0)

    @property
    def safe_unk_token(self) -> str:
        """Get the unk token safely. It always returns a unk token, which is the token
        closest to unk if not presented in the vocab.

        Returns:
            ``unk_token`` if set, otherwise :attr:`first_token`, otherwise the ``UNK`` constant.
        """
        if self.unk_token:
            return self.unk_token
        if self.first_token:
            return self.first_token
        return UNK


def create_label_vocab() -> VocabTF:
    """Create a :class:`VocabTF` constructed with ``pad_token=None`` and ``unk_token=None``."""
    label_vocab = VocabTF(pad_token=None, unk_token=None)
    return label_vocab


================================================
FILE: hanlp/components/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-26 16:10
from .pipeline import Pipeline

================================================
FILE: hanlp/components/amr/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-20 17:35


================================================
FILE: hanlp/components/amr/amrbart/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-05 17:53


================================================
FILE: hanlp/components/amr/amrbart/bart_amr_generation.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-05 17:56
import logging
import os.path
from typing import Callable, Union, List

import penman
import torch
from torch.utils.data import DataLoader

from hanlp.components.amr.amrbart.data_interface.dataset import AMR2TextDataSet
from hanlp.common.dataset import SortingSamplerBuilder, PadSequenceDataLoader
from hanlp.common.torch_component import TorchComponent
from hanlp.components.amr.seq2seq.dataset.dataset import AMRDataset
from hanlp.layers.transformers.pt_imports import AutoConfig_
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.constant import IDX
from hanlp_common.util import reorder
from hanlp.components.amr.amrbart.model_interface.modeling_bart import BartForConditionalGeneration
from hanlp.components.amr.amrbart.model_interface.tokenization_bart import AMRBartTokenizer
from hanlp.components.amr.amrbart.preprocess.read_and_process import dfs_linearize


class BART_AMR_Generation(TorchComponent):
    """Component that generates text from AMR graphs with ``BartForConditionalGeneration``.

    The optimizer, criterion, metric, training, evaluation and weight loading hooks do nothing here.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.model: BartForConditionalGeneration = None
        self.transformer_config = None
        self.tokenizer: AMRBartTokenizer = None

    def build_dataloader(self, data, batch_size=32, shuffle=False, device=None, logger: logging.Logger = None,
                         sampler_builder=None,
                         **kwargs) -> DataLoader:
        """Build a padded dataloader over AMR samples.

        Each sample gets a ``lamr`` field, the space-joined output of ``dfs_linearize`` on its ``amr``,
        and is then tokenized by ``AMR2TextDataSet.tokenize``.

        Args:
            data: The samples handed to ``AMRDataset``.
            batch_size: Forwarded to ``PadSequenceDataLoader``.
            shuffle: Forwarded to the sampler builder and to ``PadSequenceDataLoader``.
            device: Forwarded to ``PadSequenceDataLoader``.
            logger: Not used.
            sampler_builder: Builder of the batch sampler. Defaults to
                ``SortingSamplerBuilder(batch_max_tokens=500)``.
            **kwargs: Not used.

        Returns:
            The dataloader.
        """

        def linearize(sample):
            return {**sample, 'lamr': ' '.join(dfs_linearize(sample['amr']))}

        def tokenize(sample):
            return AMR2TextDataSet.tokenize(sample, tokenizer=self.tokenizer, text='text', amr='lamr')

        dataset = AMRDataset(data, generate_idx=True, cache=True)
        dataset.append_transform(linearize)
        dataset.append_transform(tokenize)
        if not sampler_builder:
            sampler_builder = SortingSamplerBuilder(batch_max_tokens=500)
        lengths = [len(sample['input_ids']) for sample in dataset]
        sampler = sampler_builder.build(lengths, shuffle, 1)
        pad_id = self.transformer_config.pad_token_id
        return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler,
                                     pad={'input_ids': pad_id, 'labels': pad_id})

    def build_optimizer(self, **kwargs):
        """Do nothing."""

    def build_criterion(self, **kwargs):
        """Do nothing."""

    def build_metric(self, **kwargs):
        """Do nothing."""

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, **kwargs):
        """Do nothing."""

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        """Do nothing."""

    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        """Do nothing."""

    def build_model(self, training=True, transformer=None, **kwargs) -> torch.nn.Module:
        """Load the pretrained model and resize its token embeddings to the tokenizer size.

        Args:
            training: The model is switched to eval mode unless this is true.
            transformer: The identifier passed to ``from_pretrained``.
            **kwargs: Not used.

        Returns:
            The model.
        """
        bart = BartForConditionalGeneration.from_pretrained(transformer, config=self.transformer_config)
        if not training:
            bart.eval()
        bart.resize_token_embeddings(len(self.tokenizer))
        return bart

    def input_is_flat(self, data):
        """Tell whether ``data`` is a single input, i.e. a ``str`` or a ``penman.Graph``."""
        return isinstance(data, (str, penman.Graph))

    def predict(
            self,
            data: Union[str, List[str]], num_beams=5, max_length=1024, beautiful_amr_graph=True, verbose=False,
            **kwargs
    ):
        """Generate text for one or several AMR graphs.

        Args:
            data: A single input or a collection of inputs. ``str`` inputs are parsed with
                ``penman.loads`` and their first graph is used; other inputs are used as they are.
            num_beams: Forwarded to :meth:`predict_batch`.
            max_length: Forwarded to :meth:`predict_batch`.
            beautiful_amr_graph: Not used.
            verbose: Whether to log progress with a ``CountdownTimer``.
            **kwargs: Not used.

        Returns:
            The generated text for a single input, or a list of them in input order otherwise.
        """
        flat = self.input_is_flat(data)
        if flat:
            data = [data]
        samples = [{'amr': penman.loads(x)[0] if isinstance(x, str) else x} for x in data]
        dataloader = self.build_dataloader(samples, **self.config, device=self.device)
        timer = CountdownTimer(len(dataloader)) if verbose else None
        results, orders = [], []
        for batch in dataloader:
            results.extend(self.predict_batch(batch, num_beams, max_length))
            orders.extend(batch[IDX])
            if timer is not None:
                timer.log()
        # Batches come from a sampler, so put the outputs back into input order.
        results = reorder(results, orders)
        return results[0] if flat else results

    def predict_batch(self, batch, num_beams, max_length):
        """Run generation on a batch and decode the outputs.

        Args:
            batch: A batch providing ``input_ids``.
            num_beams: Forwarded to ``generate``.
            max_length: Forwarded to ``generate``.

        Returns:
            The decoded outputs with special tokens skipped and surrounding whitespace stripped.
        """
        tokenizer = self.tokenizer
        input_ids = batch['input_ids']
        generated = self.model.generate(
            input_ids,
            num_beams=num_beams,
            use_cache=True,
            decoder_start_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=0,
            max_length=max_length,
            min_length=0,
            length_penalty=1.0,
        )
        texts = tokenizer.batch_decode(generated, skip_special_tokens=True)
        return [text.strip() for text in texts]

    def load_config(self, save_dir: str, filename='config.json', **kwargs):
        """Load the component config and the transformer config.

        Args:
            save_dir: A directory holding the config file. Anything that is not a directory is taken as
                the transformer identifier itself.
            filename: The config file name inside ``save_dir``.
            **kwargs: Forwarded to the parent implementation.
        """
        if not os.path.isdir(save_dir):
            transformer = save_dir
            self.config.transformer = transformer
        else:
            super().load_config(save_dir, filename, **kwargs)
            transformer = self.config.transformer
        self.transformer_config = AutoConfig_.from_pretrained(transformer)

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Load the tokenizer of ``self.config.transformer``; both arguments are not used."""
        self.tokenizer = AMRBartTokenizer.from_pretrained(self.config.transformer, use_fast=True)

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """Do nothing."""


================================================
FILE: hanlp/components/amr/amrbart/bart_amr_parser.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-05 17:56
import logging
import os.path
from typing import Callable, Union, List

import datetime
import torch
from torch.utils.data import DataLoader

from hanlp.components.amr.amrbart.data_interface.dataset import AMRParsingDataSet
from hanlp.common.dataset import SortingSamplerBuilder, PadSequenceDataLoader
from hanlp.common.torch_component import TorchComponent
from hanlp.components.amr.seq2seq.dataset.dataset import AMRDataset
from hanlp.components.amr.seq2seq.dataset.penman import AMRGraph
from hanlp.components.amr.seq2seq.evaluation import write_predictions, compute_smatch
from hanlp.layers.transformers.pt_imports import AutoConfig_
from hanlp.metrics.amr.smatch_eval import smatch_eval
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.constant import IDX
from hanlp_common.util import reorder
from hanlp.components.amr.amrbart.model_interface.modeling_bart import BartForConditionalGeneration
from hanlp.components.amr.amrbart.model_interface.tokenization_bart import AMRBartTokenizer


class BART_AMR_Parser(TorchComponent):
    """Component that parses text into AMR graphs with ``BartForConditionalGeneration``.

    Prediction and evaluation are implemented. The optimizer, criterion, metric, training and weight
    loading hooks do nothing here.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.tokenizer: AMRBartTokenizer = None
        self.transformer_config = None
        self.model: BartForConditionalGeneration = None

    def build_dataloader(self, data, batch_size=32, shuffle=False, device=None, logger: logging.Logger = None,
                         sampler_builder=None,
                         **kwargs) -> DataLoader:
        """Build a padded dataloader over samples tokenized by ``AMRParsingDataSet.tokenize``.

        Args:
            data: The samples handed to ``AMRDataset``. When it is a ``str`` (presumably a path to an AMR
                file, to be confirmed against ``AMRDataset``), the ``text`` of each sample is taken from
                the ``snt`` metadata of its ``amr``.
            batch_size: Forwarded to ``PadSequenceDataLoader``.
            shuffle: Forwarded to the sampler builder and to ``PadSequenceDataLoader``.
            device: Forwarded to ``PadSequenceDataLoader``.
            logger: Not used.
            sampler_builder: Builder of the batch sampler. Defaults to
                ``SortingSamplerBuilder(batch_max_tokens=500)``.
            **kwargs: Not used.

        Returns:
            The dataloader.
        """
        dataset = AMRDataset(data, generate_idx=True, cache=True)
        if isinstance(data, str):
            dataset.append_transform(lambda x: {**x, 'text': x['amr'].metadata['snt']})
        dataset.append_transform(
            lambda x: AMRParsingDataSet.tokenize(x, tokenizer=self.tokenizer, text='text')
        )
        if not sampler_builder:
            sampler_builder = SortingSamplerBuilder(batch_max_tokens=500)
        sampler = sampler_builder.build([len(x['input_ids']) for x in dataset], shuffle, 1)
        return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler,
                                     pad={'input_ids': self.transformer_config.pad_token_id,
                                          'labels': self.transformer_config.pad_token_id})

    def build_optimizer(self, **kwargs):
        """Do nothing."""
        pass

    def build_criterion(self, **kwargs):
        """Do nothing."""
        pass

    def build_metric(self, **kwargs):
        """Do nothing."""
        pass

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, **kwargs):
        """Do nothing."""
        pass

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        """Do nothing."""
        pass

    def build_model(self, training=True, transformer=None, **kwargs) -> torch.nn.Module:
        """Load the pretrained model and resize its token embeddings to the tokenizer size.

        Args:
            training: The model is switched to eval mode unless this is true.
            transformer: The identifier passed to ``from_pretrained``.
            **kwargs: Not used.

        Returns:
            The model.
        """
        model = BartForConditionalGeneration.from_pretrained(
            transformer,
            config=self.transformer_config,
        )
        if not training:
            model.eval()
        model.resize_token_embeddings(len(self.tokenizer))
        return model

    def input_is_flat(self, data):
        """Tell whether ``data`` is a single sentence, i.e. a ``str``."""
        return isinstance(data, str)

    def predict(
            self,
            data: Union[str, List[str]], num_beams=5, max_length=1024, beautiful_amr_graph=True, verbose=False,
            **kwargs
    ):
        """Parse one or several sentences into AMR graphs.

        Args:
            data: A sentence or a list of sentences.
            num_beams: Forwarded to :meth:`predict_batch`.
            max_length: Forwarded to :meth:`predict_batch`.
            beautiful_amr_graph: Not used.
            verbose: Whether to log progress with a ``CountdownTimer``.
            **kwargs: Not used.

        Returns:
            The graph of a single sentence, or a list of graphs in input order otherwise.
        """
        flat = self.input_is_flat(data)
        if flat:
            data = [data]
        dataloader = self.build_dataloader([{'text': x} for x in data], **self.config, device=self.device)
        orders = []
        results = []
        if verbose:
            timer = CountdownTimer(len(dataloader))
        for batch in dataloader:
            pieces = self.predict_batch(batch, num_beams, max_length)
            results.extend(pieces)
            orders.extend(batch[IDX])
            if verbose:
                # noinspection PyUnboundLocalVariable
                timer.log()
        # Batches come from a sampler, so put the outputs back into input order.
        results = reorder(results, orders)
        if flat:
            results = results[0]
        return results

    def predict_batch(self, batch, num_beams, max_length):
        """Generate ``num_beams`` candidates per input and keep the best decoded graph of each.

        Args:
            batch: A batch providing ``input_ids``.
            num_beams: The number of beams, which is also the number of sequences returned per input.
            max_length: Forwarded to ``generate``.

        Returns:
            One ``AMRGraph`` per input.
        """
        tokenizer = self.tokenizer
        input_ids = batch['input_ids']
        preds = self.model.generate(
            input_ids,
            num_beams=num_beams,
            num_return_sequences=num_beams,
            use_cache=True,
            decoder_start_token_id=tokenizer.amr_bos_token_id,
            eos_token_id=tokenizer.amr_eos_token_id,
            no_repeat_ngram_size=0,
            max_length=max_length,
            min_length=0,
            length_penalty=1.0,
        ).tolist()
        graphs = []
        # The candidates of one input are stored contiguously, ``num_beams`` at a time.
        for i in range(0, len(preds), num_beams):
            graphs_same_source = []
            for j in range(i, i + num_beams):
                ith_pred = preds[j]
                # Rewrite the AMR specific start/end markers as the regular bos/eos ids and drop the
                # padding before handing the sequence to the AMR decoder.
                ith_pred[0] = tokenizer.bos_token_id
                ith_pred = [
                    tokenizer.eos_token_id if itm == tokenizer.amr_eos_token_id else itm
                    for itm in ith_pred if itm != tokenizer.pad_token_id
                ]

                graph, status, (lin, backr) = tokenizer.decode_amr(
                    ith_pred, restore_name_ops=False
                )
                graph.status = status
                graph.nodes = lin
                graph.backreferences = backr
                graph.tokens = ith_pred
                graphs_same_source.append(graph)
            # Rank by decoding status, lowest value first. The sort is stable, so candidates with the
            # same status keep their beam order.
            graphs_same_source.sort(key=lambda g: g.status.value)
            graphs.append(graphs_same_source)
        pieces = [AMRGraph(g.triples, g.top, g.epidata, g.metadata) for g in [gs[0] for gs in graphs]]
        return pieces

    def load_config(self, save_dir: str, filename='config.json', **kwargs):
        """Load the component config and the transformer config.

        Args:
            save_dir: A directory holding the config file. Anything that is not a directory is taken as
                the transformer identifier itself.
            filename: The config file name inside ``save_dir``.
            **kwargs: Forwarded to the parent implementation.
        """
        if os.path.isdir(save_dir):
            super().load_config(save_dir, filename, **kwargs)
            transformer = self.config.transformer
        else:
            self.config.transformer = transformer = save_dir
        self.transformer_config = AutoConfig_.from_pretrained(transformer)

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Load the tokenizer of ``self.config.transformer``; both arguments are not used."""
        self.tokenizer = AMRBartTokenizer.from_pretrained(
            self.config.transformer,
            use_fast=True,
        )

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """Do nothing."""
        pass

    @torch.no_grad()
    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, ratio_width=None,
                            logger=None, input=None, use_fast=False, num_beams=5, max_length=1024,
                            **kwargs):
        """Predict graphs for every batch, write them out and score them after the last batch.

        Args:
            data: The dataloader whose batches provide the gold graphs under ``amr``.
            criterion: Not used.
            metric: Not used.
            output: Where the predictions are written; it is also handed to the scorer
                (presumably a file path, to be confirmed against ``write_predictions``).
            ratio_width: Not used.
            logger: Receives the progress logs and the warning issued when scoring fails.
            input: The gold standard handed to the scorer.
            use_fast: Score with ``compute_smatch`` instead of ``smatch_eval``.
            num_beams: Forwarded to :meth:`predict_batch`.
            max_length: Forwarded to :meth:`predict_batch`.
            **kwargs: Not used.

        Returns:
            The score returned by the scorer, or ``0`` if scoring failed or ``data`` had no batch.
        """
        self.model.eval()
        timer = CountdownTimer(len(data))
        graphs = []
        orders = []
        smatch = 0
        for idx, batch in enumerate(data):
            graphs_per_batch = self.predict_batch(batch, num_beams, max_length)
            # Copy meta data from gold graph
            for gp, gg in zip(graphs_per_batch, batch['amr']):
                metadata = gg.metadata.copy()
                metadata['annotator'] = f'{self.transformer_config.name_or_path}-amr'
                metadata['date'] = str(datetime.datetime.now())
                if 'save-date' in metadata:
                    del metadata['save-date']
                gp.metadata = metadata
            graphs.extend(graphs_per_batch)
            orders.extend(batch[IDX])
            if idx == timer.total - 1:
                graphs = reorder(graphs, orders)
                write_predictions(output, None, graphs)
                try:
                    if use_fast:
                        smatch = compute_smatch(output, input)
                    else:
                        smatch = smatch_eval(output, input, use_fast=False)
                except Exception as e:
                    # Scoring stays best-effort: the predictions are already written and the default
                    # score is reported. The failure is surfaced instead of being silently dropped,
                    # and interrupts are no longer swallowed.
                    message = f'Failed to compute Smatch, falling back to {smatch}: {e!r}'
                    if logger is not None:
                        logger.warning(message)
                    else:
                        logging.getLogger(__name__).warning(message)
                timer.log(smatch.cstr() if isinstance(smatch, MetricDict) else f'{smatch:.2%}', ratio_percentage=False,
                          logger=logger)
            else:
                timer.log(ratio_percentage=False, logger=logger)

        return smatch

    def evaluate(self, tst_data, save_dir=None, logger: logging.Logger = None, batch_size=None, output=True, **kwargs):
        """Forward to the parent implementation with ``output`` defaulting to ``True``."""
        return super().evaluate(tst_data, save_dir, logger, batch_size, output, **kwargs)


================================================
FILE: hanlp/components/amr/amrbart/common/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-05 17:53


================================================
FILE: hanlp/components/amr/amrbart/common/constant.py
================================================
# coding:utf-8
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    BartTokenizer,
    BartForConditionalGeneration,
    T5Tokenizer,
    T5Model,
    T5ForConditionalGeneration,
)
from transformers.optimization import (
    get_cosine_schedule_with_warmup,
    get_cosine_with_hard_restarts_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    get_polynomial_decay_schedule_with_warmup,
    get_constant_schedule_with_warmup,
)

raw_special_tokens = ['Ġcause-01', 'Ġpossible-01', 'Ġcontrast-01', 'Ġsay-01', 'Ġhave-03', 'Ġgovern-01', 'Ġstate-01',
                      'Ġthink-01', 'Ġdo-02', 'Ġwant-01', 'Ġknow-01', 'Ġrecommend-01', 'Ġsee-01', 'Ġresemble-01',
                      'Ġmean-01', 'Ġobligate-01', 'Ġuse-01', 'Ġgood-02', 'Ġneed-01', 'Ġwork-01', 'Ġpay-01', 'Ġget-01',
                      'Ġattack-01', 'Ġreal-04', 'Ġbelieve-01', 'Ġsupport-01', 'Ġreport-01', 'Ġtry-01', 'Ġsame-01',
                      'Ġtax-01', 'Ġoppose-01', 'Ġlive-01', 'Ġtell-01', 'Ġmake-02', 'Ġdie-01', 'Ġkill-01', 'Ġnew-01',
                      'Ġgive-01', 'Ġincrease-01', 'Ġagree-01', 'Ġactual-02', 'Ġgo-02', 'Ġright-05', 'Ġvote-01',
                      'Ġmake-01', 'Ġtake-01', 'Ġseem-01', 'Ġtalk-01', 'Ġissue-02', 'Ġbecome-01', 'Ġpost-01', 'Ġhelp-01',
                      'Ġstart-01', 'Ġend-01', 'Ġdevelop-02', 'Ġdecide-01', 'Ġfind-01', 'Ġclaim-01', 'Ġdefend-01',
                      'Ġlead-02', 'Ġhigh-02', 'Ġcontrol-01', 'Ġfree-04', 'Ġtraffic-01', 'Ġlong-03', 'Ġprovide-01',
                      'Ġcome-01', 'Ġplan-01', 'Ġproduce-01', 'Ġchange-01', 'Ġdiffer-02', 'Ġmarry-01', 'Ġemploy-01',
                      'Ġchoose-01', 'Ġfight-01', 'Ġmeet-03', 'Ġcall-01', 'Ġread-01', 'Ġunderstand-01', 'Ġsure-02',
                      'Ġcapable-01', 'Ġallow-01', 'Ġcrime-02', 'Ġinclude-01', 'Ġsell-01', 'Ġinfer-01', 'Ġshow-01',
                      'Ġfeel-01', 'Ġwar-01', 'Ġquestion-01', 'Ġlook-01', 'Ġopine-01', 'Ġlegal-02', 'Ġlose-02',
                      'Ġstop-01', 'Ġcreate-01', 'Ġcost-01', 'Ġcontinue-01', 'Ġbad-07', 'Ġact-02', 'Ġcare-03', 'Ġwin-01',
                      'Ġdiscuss-01', 'Ġdestroy-01', 'Ġpolicy-01', 'Ġelect-01', 'Ġgo-01', 'Ġtrue-01', 'Ġlie-08',
                      'Ġbase-02', 'Ġinsure-02', 'Ġinvest-01', 'Ġfund-01', 'Ġliberal-02', 'Ġtrade-01', 'Ġspeak-01',
                      'Ġinvolve-01', 'Ġfail-01', 'Ġhear-01', 'Ġlet-01', 'Ġhope-01', 'Ġinterest-01', 'Ġthreaten-01',
                      'Ġgrow-01', 'Ġdeal-01', 'Ġspend-01', 'Ġexist-01', 'Ġbegin-01', 'Ġdepend-01', 'Ġarrest-01',
                      'Ġprove-01', 'Ġbuy-01', 'Ġput-01', 'Ġget-05', 'Ġactivity-06', 'Ġoffer-01', 'Ġpersonal-02',
                      'Ġprotect-01', 'Ġquote-01', 'Ġwrite-01', 'Ġown-01', 'Ġbuild-01', 'Ġbenefit-01', 'Ġrelation-03',
                      'Ġequal-01', 'Ġsurrender-01', 'Ġexpect-01', 'Ġlike-01', 'Ġcooperate-01', 'Ġmove-01', 'Ġexcept-01',
                      'Ġrealize-01', 'Ġstrong-02', 'Ġhate-01', 'Ġargue-01', 'Ġask-01', 'Ġanswer-01', 'Ġlow-04',
                      'Ġcase-03', 'Ġresult-01', 'Ġeasy-05', 'Ġhard-02', 'Ġconcern-01', 'Ġsuspect-01', 'Ġbear-02',
                      'Ġserve-01', 'Ġaccept-01', 'Ġclear-06', 'Ġlove-01', 'Ġdemand-01', 'Ġlaunch-01', 'Ġexplain-01',
                      'Ġwrong-04', 'Ġright-06', 'Ġrequire-01', 'Ġaffect-01', 'Ġeffort-01', 'Ġforce-01', 'Ġlook-02',
                      'Ġwatch-01', 'Ġout-06', 'Ġoperate-01', 'Ġattempt-01', 'Ġban-01', 'Ġstudy-01', 'Ġsuggest-01',
                      'Ġlikely-01', 'Ġconcern-02', 'Ġthank-01', 'Ġpublic-02', 'Ġwork-09', 'Ġexemplify-01', 'Ġintend-01',
                      'Ġprice-01', 'Ġrespond-01', 'Ġpropose-01', 'Ġvisit-01', 'Ġcomplete-02', 'Ġtransfer-01',
                      'Ġaccuse-01', 'Ġcounter-01', 'Ġcut-02', 'Ġsimple-02', 'Ġcare-01', 'Ġcharge-05', 'Ġrepresent-01',
                      'Ġsucceed-01', 'Ġlocal-02', 'Ġmurder-01', 'Ġremember-01', 'Ġsend-01', 'Ġevidence-01',
                      'Ġresearch-01', 'Ġmajor-02', 'Ġwait-01', 'Ġestablish-01', 'Ġremain-01', 'Ġtest-01', 'Ġkeep-02',
                      'Ġexport-01', 'Ġannounce-01', 'Ġbomb-01', 'Ġfavor-01', 'Ġdeny-01', 'Ġrun-01', 'Ġexperience-01',
                      'Ġexpert-01', 'Ġprevent-01', 'Ġfair-01', 'Ġknow-02', 'Ġgeneral-02', 'Ġapprove-01', 'Ġwhite-02',
                      'Ġdescribe-01', 'Ġshare-01', 'Ġconsider-01', 'Ġcase-04', 'Ġreceive-01', 'Ġignore-01', 'Ġlink-01',
                      'Ġkeep-01', 'Ġcomment-01', 'Ġsex-01', 'Ġlaugh-01', 'Ġinvestigate-01', 'Ġview-02',
                      'Ġproliferate-01', 'Ġrefuse-01', 'Ġfear-01', 'Ġget-03', 'Ġwill-02', 'Ġrape-01', 'Ġallege-01',
                      'Ġget-04', 'Ġstay-01', 'Ġrise-01', 'Ġsupply-01', 'Ġdirect-02', 'Ġhonest-01', 'Ġdebate-01',
                      'Ġobvious-01', 'Ġappear-02', 'Ġcampaign-01', 'Ġblack-05', 'Ġreduce-01', 'Ġask-02',
                      'Ġcriticize-01', 'Ġguess-01', 'Ġlearn-01', 'Ġseek-01', 'Ġaccess-01', 'Ġsafe-01', 'Ġwish-01',
                      'Ġwrong-02', 'Ġeducate-01', 'Ġconflict-01', 'Ġrespect-01', 'Ġreach-01', 'Ġage-01', 'Ġmention-01',
                      'Ġexecute-01', 'Ġfind-02', 'Ġjudge-01', 'Ġbring-01', 'Ġblame-01', 'Ġhead-01', 'Ġwell-09',
                      'Ġensure-01', 'Ġarm-01', 'Ġcover-01', 'Ġserious-02', 'Ġtreat-01', 'Ġteach-01', 'Ġdoubt-01',
                      'Ġimmigrate-01', 'Ġinvade-01', 'Ġsmuggle-01', 'Ġlack-01', 'Ġearn-01', 'Ġhold-01', 'Ġlimit-01',
                      'Ġparticipate-01', 'Ġsentence-01', 'Ġdamage-01', 'Ġconsider-02', 'Ġname-01', 'Ġsorry-01',
                      'Ġrelate-01', 'Ġcriminal-03', 'Ġleft-19', 'Ġadmit-01', 'Ġadministrate-01', 'Ġtarget-01',
                      'Ġrun-02', 'Ġgo-06', 'Ġimprove-01', 'Ġconstruct-01', 'Ġmoral-02', 'Ġfollow-01', 'Ġcorrect-02',
                      'Ġprotest-01', 'Ġleave-11', 'Ġaid-01', 'Ġvalue-01', 'Ġsense-02', 'Ġdrop-01', 'Ġface-01',
                      'Ġserious-01', 'Ġseize-01', 'Ġtrain-01', 'Ġwarn-01', 'Ġavoid-01', 'Ġeffective-04', 'Ġdeserve-01',
                      'Ġplay-01', 'Ġenter-01', 'Ġregulate-01', 'Ġnear-02', 'Ġborder-01', 'Ġsolve-01', 'Ġprefer-01',
                      'Ġviolate-01', 'Ġrelease-01', 'Ġcite-01', 'Ġfocus-01', 'Ġadvise-01', 'Ġsound-01', 'Ġrisk-01',
                      'Ġreturn-01', 'Ġlist-01', 'Ġsignificant-02', 'Ġhire-01', 'Ġsurprise-01', 'Ġopen-01', 'Ġnice-01',
                      'Ġraise-01', 'Ġmaintain-01', 'Ġprivate-03', 'Ġimplement-01', 'Ġassist-01', 'Ġcall-02',
                      'Ġcompare-01', 'Ġprofit-01', 'Ġcontribute-01', 'Ġhave-to-do-with-04', 'Ġcorrupt-01', 'Ġclose-10',
                      'Ġsuffer-01', 'Ġexpand-01', 'Ġwonder-01', 'Ġresponsible-01', 'Ġtotal-01', 'Ġspecific-02',
                      'Ġpass-01', 'Ġhappy-01', 'Ġassume-02', 'Ġchance-02', 'Ġremove-01', 'Ġadd-02', 'Ġmanufacture-01',
                      'Ġexpress-01', 'Ġinspect-01', 'Ġwalk-01', 'Ġgood-03', 'Ġrule-01', 'Ġmanage-01', 'Ġhold-04',
                      'Ġspecial-02', 'Ġinfluence-01', 'Ġexchange-01', 'Ġtake-10', 'Ġconvict-01', 'Ġprocess-02',
                      'Ġtravel-01', 'Ġcarry-01', 'Ġdefine-01', 'Ġdisagree-01', 'Ġsave-02', 'Ġpermit-01', 'Ġestimate-01',
                      'Ġrate-01', 'Ġcall-03', 'Ġsingle-02', 'Ġabuse-01', 'Ġsign-01', 'Ġrule-03', 'Ġact-01',
                      'Ġachieve-01', 'Ġintervene-01', 'Ġfall-01', 'Ġattend-02', 'Ġfeel-02', 'Ġadopt-01', 'Ġfollow-02',
                      'Ġgo-on-15', 'Ġloan-01', 'Ġnegotiate-01', 'Ġhit-01', 'Ġcondition-01', 'Ġshort-07', 'Ġpromise-01',
                      'Ġrebel-01', 'Ġpromote-02', 'Ġstrengthen-01', 'Ġsanction-02', 'Ġwarm-01', 'Ġbehave-01',
                      'Ġhave-06', 'Ġsuffice-01', 'Ġlead-03', 'Ġtry-02', 'Ġlike-02', 'Ġfire-01', 'Ġdrive-01', 'Ġfly-01',
                      'Ġgain-02', 'Ġafford-01', 'Ġexplode-01', 'Ġpoint-out-02', 'Ġconsume-01', 'Ġmeasure-02',
                      'Ġreform-01', 'Ġenjoy-01', 'Ġsit-01', 'Ġavailable-02', 'Ġstrike-01', 'Ġsign-02', 'Ġcome-03',
                      'Ġnatural-03', 'Ġorganize-01', 'Ġprepare-02', 'Ġreplace-01', 'Ġhanging-07', 'Ġleave-15',
                      'Ġretire-01', 'Ġimport-01', 'Ġrange-01', 'Ġokay-04', 'Ġcover-03', 'Ġimagine-01', 'Ġkey-02',
                      'Ġsurvive-01', 'Ġfree-03', 'Ġbase-01', 'Ġcomplain-01', 'Ġnormal-02', 'Ġcomplete-01', 'Ġreveal-01',
                      'Ġenforce-01', 'Ġdetermine-01', 'Ġvictimize-01', 'Ġrepeat-01', 'Ġinterview-01', 'Ġmake-05',
                      'Ġdonate-01', 'Ġsteal-01', 'Ġquick-02', 'Ġattract-01', 'Ġanalyze-01', 'Ġally-01', 'Ġsuppose-01',
                      'Ġresponsible-03', 'Ġclose-01', 'Ġcombat-01', 'Ġidentify-01', 'Ġsuppose-02', 'Ġrecord-01',
                      'Ġnominate-01', 'Ġrely-01', 'Ġturn-02', 'Ġhandle-01', 'Ġprocess-01', 'Ġpredict-01', 'Ġdeploy-01',
                      'Ġfortunate-01', 'Ġeat-01', 'Ġjustify-01', 'Ġexpend-01', 'Ġbullshit-01', 'Ġdiscover-01',
                      'Ġenrich-01', 'Ġcommit-02', 'Ġshoot-02', 'Ġcheap-02', 'Ġreject-01', 'Ġweak-02', 'Ġpowerful-02',
                      'Ġdispute-01', 'Ġlegislate-01', 'Ġissue-01', 'Ġarrive-01', 'Ġjoin-01', 'Ġapply-02',
                      'Ġindicate-01', 'Ġengage-01', 'Ġinnocent-01', 'Ġfast-02', 'Ġpressure-01', 'Ġpublish-01',
                      'Ġobtain-01', 'Ġsad-02', 'Ġconfirm-01', 'Ġtreat-03', 'Ġlead-01', 'Ġlisten-01', 'Ġoffend-01',
                      'Ġaddress-02', 'Ġword-01', 'Ġright-08', 'Ġnote-01', 'Ġcontain-01', 'Ġpurchase-01', 'Ġrequest-01',
                      'Ġgood-04', 'Ġdesign-01', 'Ġnotice-01', 'Ġpresent-01', 'Ġshock-01', 'Ġright-02', 'Ġtransport-01',
                      'Ġdeliver-01', 'Ġburn-01', 'Ġfault-01', 'Ġmatter-01', 'Ġabort-01', 'Ġstick-01', 'Ġconnect-01',
                      'Ġconclude-01', 'Ġcontract-02', 'Ġpossess-01', 'Ġend-up-03', 'Ġsearch-01', 'Ġget-02',
                      'Ġqualify-02', 'Ġreact-01', 'Ġconfuse-01', 'Ġanger-01', 'Ġpursue-01', 'Ġreside-01',
                      'Ġrelevant-01', 'Ġoccupy-01', 'Ġwithdraw-01', 'Ġokay-01', 'Ġconform-01', 'Ġdemonstrate-01',
                      'Ġwear-01', 'Ġhave-04', 'Ġdecrease-01', 'Ġpunish-01', 'Ġpractice-01', 'Ġcapture-01', 'Ġgo-03',
                      'Ġpoll-01', 'Ġshow-04', 'Ġrefer-01', 'Ġcommit-01', 'Ġdisarm-01', 'Ġbelong-01', 'Ġdivide-02',
                      'Ġdrink-01', 'Ġdesire-01', 'Ġsave-01', 'Ġignorant-02', 'Ġperfect-02', 'Ġposition-02', 'Ġcrap-01',
                      'Ġinsult-01', 'Ġprivate-02', 'Ġwaste-01', 'Ġguilty-01', 'Ġeliminate-01', 'Ġmortgage-01',
                      'Ġworth-01', 'Ġinherit-01', 'Ġthrow-01', 'Ġtour-01', 'Ġsuspend-01', 'Ġharm-01', 'Ġimpose-01',
                      'Ġimprison-01', 'Ġrecognize-01', 'Ġprosecute-01', 'Ġview-01', 'Ġforget-01', 'Ġfound-01',
                      'Ġchallenge-01', 'Ġtrouble-01', 'Ġsecure-02', 'Ġorder-01', 'Ġpartner-01', 'Ġspend-02',
                      'Ġprogressive-02', 'Ġaccount-01', 'Ġblock-01', 'Ġguarantee-01', 'Ġconvince-01', 'Ġworry-02',
                      'Ġendanger-01', 'Ġmovement-07', 'Ġfuck-01', 'Ġextend-01', 'Ġseparate-02', 'Ġbalance-01',
                      'Ġlose-03', 'Ġpower-01', 'Ġsue-02', 'Ġurge-01', 'Ġcheck-01', 'Ġpoint-01', 'Ġturn-01',
                      'Ġprogress-01', 'Ġrecover-01', 'Ġridiculous-02', 'Ġaccompany-01', 'Ġappear-01', 'Ġworry-01',
                      'Ġplace-01', 'Ġattend-01', 'Ġsleep-01', 'Ġbreak-01', 'Ġfind-out-03', 'Ġbias-01', 'Ġaccord-03',
                      'Ġwide-02', 'Ġenable-01', 'Ġaffair-02', 'Ġhide-01', 'Ġhold-02', 'Ġrecognize-02', 'Ġback-01',
                      'Ġbet-01', 'Ġhack-04', 'Ġacquire-01', 'Ġtake-04', 'Ġpenalize-01', 'Ġmessage-01', 'Ġready-02',
                      'Ġcease-01', 'Ġcrazy-03', 'Ġbad-04', 'Ġcompete-02', 'Ġcontact-01', 'Ġsource-01', 'Ġset-up-03',
                      'Ġrestrict-01', 'Ġregard-01', 'Ġwitness-01', 'Ġlabor-01', 'Ġsmoke-02', 'Ġkick-01', 'Ġcompete-01',
                      'Ġhouse-01', 'Ġhurt-01', 'Ġimprovise-01', 'Ġfinance-01', 'Ġinsist-01', 'Ġfarm-01', 'Ġapply-01',
                      'Ġstep-01', 'Ġdeep-02', 'Ġpride-01', 'Ġbill-01', 'Ġpretend-01', 'Ġfill-01', 'Ġfine-04',
                      'Ġstop-03', 'Ġoffend-03', 'Ġadvertise-01', 'Ġstand-01', 'Ġaim-02', 'Ġimpact-01', 'Ġfeed-01',
                      'Ġgrant-01', 'Ġlast-01', 'Ġform-01', 'Ġdrive-02', 'Ġengineer-01', 'Ġinjure-01', 'Ġdevelop-01',
                      'Ġpresent-02', 'Ġsubsidize-01', 'Ġbring-up-02', 'Ġintelligent-01', 'Ġwelcome-01', 'Ġtake-away-05',
                      'Ġresolve-01', 'Ġappropriate-02', 'Ġencourage-01', 'Ġperform-02', 'Ġgo-back-19', 'Ġdeclare-02',
                      'Ġfull-09', 'Ġhopeful-03', 'Ġconduct-01', 'Ġsurgery-01', 'Ġdetain-01', 'Ġrelative-05',
                      'Ġcount-01', 'Ġglad-02', 'Ġrare-02', 'Ġcome-out-09', 'Ġapproach-02', 'Ġrace-02', 'Ġbattle-01',
                      'Ġcross-02', 'Ġmove-02', 'Ġquestion-03', 'Ġadminister-01', 'Ġgrow-03', 'Ġmeet-02', 'Ġdown-03',
                      'Ġmeet-01', 'Ġcondemn-01', 'Ġreason-01', 'Ġcarry-out-03', 'Ġworth-02', 'Ġinform-01', 'Ġstable-03',
                      'Ġstand-11', 'Ġutilize-01', 'Ġperpetrate-01', 'Ġassociate-01', 'Ġapologize-01', 'Ġcredit-01',
                      'Ġdisgust-01', 'Ġspread-03', 'Ġcommand-02', 'Ġsense-01', 'Ġdetail-01', 'Ġdefeat-01',
                      'Ġdistribute-01', 'Ġgive-up-07', 'Ġpain-01', 'Ġship-01', 'Ġkeep-04', 'Ġaddict-01',
                      'Ġcompromise-01', 'Ġlegitimate-02', 'Ġregular-02', 'Ġpick-01', 'Ġsource-02', 'Ġraid-01',
                      'Ġhard-04', 'Ġrain-01', 'Ġcommunicate-01', 'Ġmarket-01', 'Ġlower-05', 'Ġill-01', 'Ġdefraud-01',
                      'Ġposition-01', 'Ġterrible-01', 'Ġdivorce-01', 'Ġamaze-01', 'Ġedit-01', 'Ġspread-02',
                      'Ġclarify-10', 'Ġargue-02', 'Ġpush-01', 'Ġmiss-01', 'Ġimply-01', 'Ġdiscriminate-02', 'Ġlight-06',
                      'Ġappoint-01', 'Ġdelay-01', 'Ġgross-03', 'Ġput-03', 'Ġintroduce-02', 'Ġstandard-02', 'Ġpull-01',
                      'Ġdraw-02', 'Ġgo-08', 'Ġaim-01', 'Ġmodern-02', 'Ġdare-01', 'Ġneighbor-01', 'Ġconfront-01',
                      'Ġsuperior-01', 'Ġreasonable-02', 'Ġschedule-01', 'Ġadd-01', 'Ġnew-02', 'Ġlend-01', 'Ġdouble-01',
                      'Ġfinish-01', 'Ġraise-03', 'Ġexcuse-02', 'Ġmonitor-01', 'Ġobserve-01', 'Ġpopular-02',
                      'Ġcharge-01', 'Ġbudget-01', 'Ġnegative-03', 'Ġdirect-01', 'Ġrid-01', 'Ġmake-18', 'Ġmean-02',
                      'Ġfame-01', 'Ġjoke-01', 'Ġbeautiful-02', 'Ġtend-02', 'Ġrob-01', 'Ġriot-01', 'Ġsponsor-01',
                      'Ġentitle-01', 'Ġlobby-01', 'Ġbad-02', 'Ġcollapse-01', 'Ġexpose-01', 'Ġemphasize-01',
                      'Ġfriendly-01', 'Ġplay-02', 'Ġinitiate-01', 'Ġappreciate-02', 'Ġremind-01', 'Ġblack-04',
                      'Ġefficient-01', 'Ġconverse-01', 'Ġresponsible-02', 'Ġmeasure-01', 'Ġcome-04', 'Ġeffect-03',
                      'Ġsubject-01', 'Ġmistake-02', 'Ġpass-03', 'Ġsignal-07', 'Ġguard-01', 'Ġopen-04', 'Ġset-02',
                      'Ġfun-01', 'Ġcome-up-11', 'Ġflee-05', 'Ġlabel-01', 'Ġsize-01', 'Ġconfident-01', 'Ġsmart-06',
                      'Ġhost-01', 'Ġtough-02', 'Ġrecall-02', 'Ġscare-01', 'Ġdream-01', 'Ġassault-01', 'Ġfreeze-02',
                      'Ġtake-over-12', 'Ġrecession-02', 'Ġfunction-01', 'Ġwhine-01', 'Ġshort-06', 'Ġprosper-01',
                      'Ġadvanced-02', 'Ġvalue-02', 'Ġbother-01', 'Ġcomply-01', 'Ġright-04', 'Ġrevolution-03',
                      'Ġaccomplish-01', 'Ġgo-out-17', 'Ġfigure-out-05', 'Ġslow-05', 'Ġaccountable-02', 'Ġcool-01',
                      'Ġdocument-01', 'Ġauthorize-01', 'Ġembargo-01', 'Ġvolunteer-01', 'Ġregister-02', 'Ġfrequent-02',
                      'Ġrank-01', 'Ġresist-01', 'Ġbreak-up-08', 'Ġred-02', 'Ġcomfortable-02', 'Ġexamine-01',
                      'Ġadjust-01', 'Ġoriginate-01', 'Ġreply-01', 'Ġbreak-18', 'Ġshoot-01', 'Ġmiss-02', 'Ġdismiss-01',
                      'Ġcollect-01', 'Ġdraft-01', 'Ġsubmit-01', 'Ġrelieve-01', 'Ġembarrass-01', 'Ġreturn-02',
                      'Ġvoluntary-02', 'Ġpure-02', 'Ġbeat-01', 'Ġbear-01', 'Ġvary-01', 'Ġsick-05', 'Ġaffair-01',
                      'Ġtypical-02', 'Ġnegative-02', 'Ġserve-02', 'Ġeradicate-01', 'Ġrealize-02', 'Ġperceive-01',
                      'Ġleave-14', 'Ġgive-16', 'Ġback-up-04', 'Ġgenerate-01', 'Ġbail-out-02', 'Ġtouch-01',
                      'Ġcultivate-01', 'Ġconvert-01', 'Ġdismantle-01', 'Ġservice-05', 'Ġstraight-04', 'Ġbad-05',
                      'Ġforce-04', 'Ġadvocate-01', 'Ġpray-01', 'Ġdecline-01', 'Ġinfect-01', 'Ġtitle-01',
                      'Ġdesperate-02', 'Ġupset-01', 'Ġtolerate-01', 'Ġprohibit-01', 'Ġmind-05', 'Ġbeat-03', 'Ġveto-01',
                      'Ġcrash-01', 'Ġside-01', 'Ġcombine-01', 'Ġclose-13', 'Ġgo-10', 'Ġequip-01', 'Ġrant-01',
                      'Ġjail-01', 'Ġcopy-01', 'Ġdrop-05', 'Ġconsistent-02', 'Ġspend-04', 'Ġsend-03', 'Ġcritical-02',
                      'Ġcarry-on-02', 'Ġraise-02', 'Ġmotivate-01', 'Ġguide-01', 'Ġwonderful-03', 'Ġtrust-01',
                      'Ġreverse-01', 'Ġjust-02', 'Ġclaim-02', 'Ġsurvey-01', 'Ġspy-01', 'Ġget-22', 'Ġhave-05',
                      'Ġcool-04', 'Ġpicture-01', 'Ġunion-02', 'Ġmanage-02', 'Ġinstruct-01', 'Ġblow-03', 'Ġsacrifice-01',
                      'Ġowe-01', 'Ġappeal-01', 'Ġexceed-01', 'Ġradiate-01', 'Ġhonor-01', 'Ġseparate-01', 'Ġarrange-01',
                      'Ġdominate-01', 'Ġtransact-01', 'Ġgrow-up-04', 'Ġverify-01', 'Ġgo-05', 'Ġfamiliarize-01',
                      'Ġrenew-01', 'Ġfire-02', 'Ġtake-out-11', 'Ġinterpret-01', 'Ġvalid-02', 'Ġshow-up-02',
                      'Ġconfiscate-01', 'Ġshut-down-05', 'Ġcheat-03', 'Ġharass-01', 'Ġtie-01', 'Ġabuse-02',
                      'Ġassess-01', 'Ġcompensate-01', 'Ġsensitive-03', 'Ġsettle-02', 'Ġencounter-01', 'Ġmatch-01',
                      'Ġrecover-02', 'Ġtrust-02', 'Ġperform-01', 'Ġborrow-01', 'Ġselect-01', 'Ġbetray-01', 'Ġride-01',
                      'Ġuseful-05', 'Ġsplit-01', 'Ġshift-01', 'Ġannoy-01', 'Ġmind-01', 'Ġfair-04', 'Ġoppress-01',
                      'Ġinterfere-01', 'Ġcredit-02', 'Ġlaunder-01', 'Ġamount-01', 'Ġleave-13', 'Ġrescue-01',
                      'Ġstaff-01', 'Ġplay-11', 'Ġkind-01', 'Ġauthor-01', 'Ġsympathize-01', 'Ġupgrade-02',
                      'Ġsuppress-01', 'Ġwake-up-02', 'Ġinvite-01', 'Ġcome-12', 'Ġdeter-01', 'Ġbrainwash-01', 'Ġshit-01',
                      'Ġfix-02', 'Ġwhite-03', 'Ġgroup-01', 'Ġabsent-01', 'Ġarmor-01', 'Ġup-03', 'Ġpraise-01',
                      'Ġreview-01', 'Ġdry-02', 'Ġintercept-01', 'Ġbroadcast-01', 'Ġworship-01', 'Ġterm-01',
                      'Ġobject-01', 'Ġpledge-01', 'Ġprepare-01', 'Ġopen-up-03', 'Ġlay-01', 'Ġfile-01', 'Ġcheck-out-05',
                      'Ġattach-01', 'Ġsatisfy-01', 'Ġdepart-01', 'Ġopposite-01', 'Ġworsen-01', 'Ġaward-01',
                      'Ġpollute-01', 'Ġretaliate-01', 'Ġdisrupt-01', 'Ġreturn-05', 'Ġpopulate-01', 'Ġenvision-01',
                      'Ġplease-01', 'Ġrepair-01', 'Ġslaughter-01', 'Ġsin-01', 'Ġconstitute-01', 'Ġshop-01',
                      'Ġtranslate-01', 'Ġassure-01', 'Ġpay-off-02', 'Ġstimulate-01', 'Ġdamn-01', 'Ġswitch-01',
                      'Ġdisappear-01', 'Ġreelect-01', 'Ġspin-03', 'Ġtestify-01', 'Ġlegalize-01', 'Ġprint-01',
                      'Ġaverage-01', 'Ġright-03', 'Ġfix-03', 'Ġundermine-01', 'Ġcome-on-25', 'Ġlicense-01',
                      'Ġindict-01', 'Ġtransit-01', 'Ġwash-01', 'Ġbreathe-01', 'Ġbroad-02', 'Ġleave-17', 'Ġorder-02',
                      'Ġhead-02', 'Ġsing-01', 'Ġentertain-01', 'Ġcomplicate-01', 'Ġpush-02', 'Ġrealistic-03',
                      'Ġdisappoint-01', 'Ġbother-02', 'Ġtough-03', 'Ġdisplay-01', 'Ġflow-01', 'Ġdiffer-01', 'Ġlie-07',
                      'Ġpremise-01', 'Ġrelocate-01', 'Ġcorrect-01', 'Ġcoordinate-01', 'Ġabandon-01', 'Ġdictate-01',
                      'Ġplay-08', 'Ġrebuild-01', 'Ġclean-04', 'Ġwork-out-02', 'Ġrun-13', 'Ġcurious-01', 'Ġpromote-01',
                      'Ġspecialize-01', 'Ġstarve-01', 'Ġshame-02', 'Ġfit-06', 'Ġflaw-01', 'Ġfigure-01', 'Ġhunt-01',
                      'Ġexperiment-01', 'Ġmix-01', 'Ġregular-03', 'Ġfree-01', 'Ġdeclare-01', 'Ġescape-01', 'Ġput-02',
                      'Ġobsess-01', 'Ġbuild-up-05', 'Ġshut-up-06', 'Ġrally-01', 'Ġdissent-01', 'Ġprogram-01',
                      'Ġamend-01', 'Ġinvent-01', 'Ġleak-01', 'Ġtrigger-01', 'Ġdistinguish-01', 'Ġsymbolize-01',
                      'Ġexcellent-02', 'Ġlook-04', 'Ġcry-02', 'Ġassign-01', 'Ġrecruit-01', 'Ġcope-01', 'Ġmigrate-01',
                      'Ġtake-on-09', 'Ġbless-01', 'Ġsharp-02', 'Ġuse-02', 'Ġdisturb-01', 'Ġconsult-01', 'Ġlay-off-02',
                      'Ġbid-01', 'Ġaccord-02', 'Ġbusy-01', 'Ġprovoke-01', 'Ġisolate-01', 'Ġdirty-02', 'Ġblind-02',
                      'Ġstage-01', 'Ġboost-01', 'Ġoutrage-01', 'Ġtrack-01', 'Ġretard-01', 'Ġexclude-01', 'Ġpatent-01',
                      'Ġblog-01', 'Ġtorture-01', 'Ġplot-01', 'Ġcut-01', 'Ġhunger-01', 'Ġoverwhelm-01', 'Ġexploit-01',
                      'Ġland-01', 'Ġreserve-01', 'Ġbetter-01', 'Ġup-02', 'Ġremark-01', 'Ġpiss-03', 'Ġexcuse-01',
                      'Ġparalyze-01', 'Ġsummarize-01', 'Ġload-01', 'Ġdevote-01', 'Ġbury-01', 'Ġsurround-01',
                      'Ġdance-01', 'Ġdistort-01', 'Ġretain-01', 'Ġoverthrow-01', 'Ġrival-01', 'Ġready-01', 'Ġevolve-01',
                      'Ġimpoverish-01', 'Ġalarm-01', 'Ġunify-01', 'Ġrepay-01', 'Ġassume-01', 'Ġclose-06', 'Ġadmire-01',
                      'Ġvow-01', 'Ġaverage-04', 'Ġsight-01', 'Ġinflate-01', 'Ġreference-04', 'Ġlook-up-05',
                      'Ġcivilize-01', 'Ġsuitable-04', 'Ġdetect-01', 'Ġpiss-off-02', 'Ġassassinate-01', 'Ġopen-05',
                      'Ġshave-01', 'Ġemail-01', 'Ġfuel-01', 'Ġincentivize-01', 'Ġmark-01', 'Ġsustain-01',
                      'Ġspeculate-01', 'Ġsurveil-01', 'Ġswim-01', 'Ġconquer-01', 'Ġgenocide-01', 'Ġhoax-01',
                      'Ġnotice-03', 'Ġbe-done-08', 'Ġopt-01', 'Ġbait-01', 'Ġcompile-01', 'Ġinnovate-01', 'Ġallocate-01',
                      'Ġshelter-01', 'Ġcontrary-01', 'Ġburden-01', 'Ġfreeze-01', 'Ġinspire-01', 'Ġgraduate-01',
                      'Ġwipe-out-02', 'Ġfall-05', 'Ġcover-up-04', 'Ġrepute-01', 'Ġenhance-01', 'Ġclassify-01',
                      'Ġgreen-03', 'Ġscore-01', 'Ġmodify-01', 'Ġreflect-01', 'Ġforce-02', 'Ġequate-01',
                      'Ġmerchandise-01', 'Ġregret-01', 'Ġovercome-01', 'Ġprocure-01', 'Ġscam-01', 'Ġquit-01',
                      'Ġdrill-01', 'Ġdisable-01', 'Ġgrasp-01', 'Ġorbit-01', 'Ġlaughable-03', 'Ġconsent-01',
                      'Ġendorse-01', 'Ġcatch-02', 'Ġleave-02', 'Ġweigh-01', 'Ġroll-01', 'Ġrestore-01', 'Ġshape-01',
                      'Ġcomprehend-01', 'Ġtrip-03', 'Ġget-away-08', 'Ġsingle-03', 'Ġphone-01', 'Ġintimidate-01',
                      'Ġinstall-01', 'Ġsuck-03', 'Ġback-02', 'Ġdeem-01', 'Ġmake-up-10', 'Ġplant-01', 'Ġhand-out-03',
                      'Ġgo-off-16', 'Ġspeed-01', 'Ġrefute-01', 'Ġimplicate-01', 'Ġdock-01', 'Ġcrack-down-06',
                      'Ġforecast-01', 'Ġrush-01', 'Ġgenerous-01', 'Ġunite-01', 'Ġgrab-01', 'Ġcompetent-01',
                      'Ġground-02', 'Ġevaluate-01', 'Ġadvance-01', 'Ġmainstream-02', 'Ġdiagnose-01', 'Ġpass-05',
                      'Ġuphold-01', 'Ġhalt-01', 'Ġhinder-01', 'Ġbefriend-01', 'Ġconvene-01', 'Ġawe-01', 'Ġapplaud-01',
                      'Ġmodernize-01', 'Ġintegrate-01', 'Ġexecute-02', 'Ġwound-01', 'Ġprostitute-01', 'Ġexercise-01',
                      'Ġbind-01', 'Ġphotograph-01', 'Ġfascinate-01', 'Ġreward-01', 'Ġclean-up-02', 'Ġrepeal-01',
                      'Ġtwist-01', 'Ġmodel-01', 'Ġmandate-01', 'Ġconspire-01', 'Ġtear-01', 'Ġbrutal-02', 'Ġcharge-08',
                      'Ġdry-08', 'Ġwow-01', 'Ġbank-01', 'Ġfuck-up-02', 'Ġstand-up-07', 'Ġportray-01', 'Ġnationalize-01',
                      'Ġliberate-01', 'Ġexempt-01', 'Ġdefy-01', 'Ġshout-01', 'Ġdevastate-01', 'Ġhijack-01',
                      'Ġacknowledge-01', 'Ġcompromise-02', 'Ġconsist-01', 'Ġcoach-01', 'Ġintense-02', 'Ġdrag-01',
                      'Ġminor-01', 'Ġfulfill-01', 'Ġclear-01', 'Ġdeceive-01', 'Ġshake-01', 'Ġcold-01', 'Ġalign-01',
                      'Ġsupervise-01', 'Ġinternal-02', 'Ġgift-01', 'Ġstruggle-01', 'Ġcast-01', 'Ġfeature-01',
                      'Ġharsh-02', 'Ġemerge-01', 'Ġfollow-04', 'Ġcut-off-04', 'Ġmistake-01', 'Ġlocate-01', 'Ġslow-01',
                      'Ġaccelerate-01', 'Ġcover-02', 'Ġsoft-02', 'Ġidentical-01', 'Ġsail-01', 'Ġjump-03',
                      'Ġfacilitate-01', 'Ġexcessive-02', 'Ġalter-01', 'Ġescalate-01', 'Ġmad-04', 'Ġkid-01', 'Ġfloat-01',
                      'Ġmess-up-02', 'Ġkidnap-01', 'Ġbore-02', 'Ġclean-01', 'Ġforgive-01', 'Ġgo-through-20', 'Ġcare-04',
                      'Ġmeet-up-04', 'Ġmoisturize-01', 'Ġhighlight-01', 'Ġdislike-01', 'Ġboom-02', 'Ġblow-up-06',
                      'Ġappeal-02', 'Ġadhere-02', 'Ġcontradict-01', 'Ġleave-12', 'Ġdialogue-01', 'Ġpush-04',
                      'Ġcontaminate-01', 'Ġfinalize-01', 'Ġtape-02', 'Ġpatrol-01', 'Ġincite-01', 'Ġrenounce-01',
                      'Ġhallucinate-01', 'Ġundertake-01', 'Ġaverage-03', 'Ġcompel-01', 'Ġstruggle-02', 'Ġgo-12',
                      'Ġtrap-01', 'Ġquiet-04', 'Ġconvey-01', 'Ġopen-02', 'Ġclothe-01', 'Ġexclusive-02', 'Ġgather-03',
                      'Ġextensive-03', 'Ġapproach-01', 'Ġmanipulate-02', 'Ġinfringe-01', 'Ġruin-01', 'Ġstrive-01',
                      'Ġproductive-03', 'Ġexplore-01', 'Ġinhabit-01', 'Ġpress-01', 'Ġforbid-01', 'Ġhit-02',
                      'Ġabolish-01', 'Ġimpress-01', 'Ġprospect-02', 'Ġgoogle-01', 'Ġsink-01', 'Ġresign-01',
                      'Ġpull-out-02', 'Ġstation-01', 'Ġcenter-02', 'Ġindustrialize-01', 'Ġcounsel-01', 'Ġpropel-01',
                      'Ġsmell-01', 'Ġmoderate-03', 'Ġpresume-01', 'Ġrun-09', 'Ġkeep-up-10', 'Ġdeal-03', 'Ġapprehend-01',
                      'Ġsick-02', 'Ġsmell-02', 'Ġhave-11', 'Ġfrustrate-01', 'Ġcatch-01', 'Ġimpression-03',
                      'Ġspecify-01', 'Ġemploy-02', 'Ġthankful-02', 'Ġman-01', 'Ġprioritize-01', 'Ġattribute-01',
                      'Ġproject-01', 'Ġparrot-01', 'Ġbitch-01', 'Ġstand-04', 'Ġvoice-01', 'Ġpreserve-01',
                      'Ġpublicize-01', 'Ġexhibit-01', 'Ġundergo-28', 'Ġhelp-02', 'Ġbankrupt-01', 'Ġflood-01',
                      'Ġprecede-01', 'Ġreinforce-01', 'Ġtask-01', 'Ġtype-03', 'Ġtransform-01', 'Ġdespair-01',
                      'Ġchase-01', 'Ġspread-01', 'Ġappall-01', 'Ġrestrain-01', 'Ġterrify-01', 'Ġfool-01', 'Ġaspire-01',
                      'Ġwarm-07', 'Ġbring-up-08', 'Ġbleed-01', 'Ġdepress-01', 'Ġcare-02', 'Ġalert-01', 'Ġwonder-02',
                      'Ġdrop-out-04', 'Ġspoil-01', 'Ġstink-01', 'Ġdrug-01', 'Ġoverturn-01', 'Ġheat-01', 'Ġmerge-01',
                      'Ġpeak-01', 'Ġset-01', 'Ġsolid-02', 'Ġinteract-01', 'Ġthrow-out-06', 'Ġholiday-01', 'Ġrefine-01',
                      'Ġallow-02', 'Ġsign-up-03', 'Ġbribe-01', 'Ġappease-01', 'Ġstress-02', 'Ġfine-01', 'Ġminor-02',
                      'Ġmine-01', 'Ġlove-02', 'Ġnetwork-01', 'Ġdeposit-01', 'Ġstore-01', 'Ġextract-01',
                      'Ġinterrogate-01', 'Ġturn-out-11', 'Ġimpregnate-01', 'Ġfake-02', 'Ġwhore-01', 'Ġconceal-01',
                      'Ġfire-03', 'Ġlean-01', 'Ġharmful-02', 'Ġout-05', 'Ġfall-07', 'Ġdodge-01', 'Ġorient-01',
                      'Ġbrand-01', 'Ġsocial-03', 'Ġcut-03', 'Ġcap-01', 'Ġoverpay-01', 'Ġbridge-01', 'Ġcollaborate-01',
                      'Ġaddress-03', 'Ġdivert-01', 'Ġpull-09', 'Ġrevise-01', 'Ġmolest-01', 'Ġextradite-01',
                      'Ġdismiss-02', 'Ġreprocess-01', 'Ġaccumulate-01', 'Ġoccasion-02', 'Ġobstruct-01',
                      'Ġbreak-down-12', 'Ġrumor-01', 'Ġfirm-03', 'Ġsettle-03', 'Ġorder-03', 'Ġstipulate-01',
                      'Ġaudit-01', 'Ġenact-01', 'Ġcelebrate-02', 'Ġbargain-01', 'Ġsucceed-03', 'Ġinject-01',
                      'Ġexcite-01', 'Ġgreet-01', 'Ġblack-07', 'Ġterminate-01', 'Ġdescend-01', 'Ġemerge-02', 'Ġwreck-01',
                      'Ġabsorb-01', 'Ġblow-01', 'Ġfine-03', 'Ġcirculate-01', 'Ġtight-05', 'Ġoffense-02', 'Ġactivate-01',
                      'Ġsecure-01', 'Ġpass-by-17', 'Ġbash-01', 'Ġprop-up-01', 'Ġcount-04', 'Ġslap-01', 'Ġbring-down-03',
                      'Ġamuse-01', 'Ġfilm-01', 'Ġintroduce-01', 'Ġdesignate-01', 'Ġhang-01', 'Ġwave-04',
                      'Ġprivilege-01', 'Ġtake-02', 'Ġcycle-02', 'Ġcancel-01', 'Ġbuy-05', 'Ġsweep-01', 'Ġhelp-out-03',
                      'Ġleft-20', 'Ġsuit-01', 'Ġenslave-01', 'Ġrest-01', 'Ġambush-01', 'Ġmean-04', 'Ġdistract-01',
                      'Ġmatch-03', 'Ġwarrant-01', 'Ġdisguise-01', 'Ġmake-up-07', 'Ġparty-01', 'Ġclose-11', 'Ġfall-10',
                      'Ġpump-01', 'Ġresort-01', 'Ġget-back-10', 'Ġregain-01', 'Ġlose-01', 'Ġerr-01', 'Ġrun-out-05',
                      'Ġthat-is-it-00', 'Ġaggravate-01', 'Ġloot-01', 'Ġhappen-02', 'Ġscrew-02', 'Ġmake-it-14',
                      'Ġpick-up-04', 'Ġrefer-02', 'Ġbreak-13', 'Ġupdate-01', 'Ġshine-01', 'Ġcongratulate-01',
                      'Ġpilot-01', 'Ġdisgrace-01', 'Ġfabricate-01', 'Ġsicken-01', 'Ġcriticism-04', 'Ġpreach-01',
                      'Ġdeport-01', 'Ġdeal-02', 'Ġinflict-01', 'Ġgain-01', 'Ġresume-01', 'Ġoutlaw-01', 'Ġshoot-down-05',
                      'Ġpartition-01', 'Ġaddress-01', 'Ġenvy-01', 'Ġbreak-02', 'Ġspeak-out-03', 'Ġbroaden-01',
                      'Ġstress-01', 'Ġinfiltrate-01', 'Ġflat-06', 'Ġimpeach-01', 'Ġtransgress-01', 'Ġpardon-01',
                      'Ġuncover-01', 'Ġcomprise-01', 'Ġreconstruct-01', 'Ġlibel-01', 'Ġhand-01', 'Ġhint-01',
                      'Ġencourage-02', 'Ġprevail-02', 'Ġbrave-02', 'Ġforesee-01', 'Ġconcede-01', 'Ġdeteriorate-01',
                      'Ġtopple-01', 'Ġmobile-02', 'Ġpanic-01', 'Ġmisunderstand-01', 'Ġtire-01', 'Ġenthusiastic-03',
                      'Ġexercise-02', 'Ġpersist-01', 'Ġinferior-01', 'Ġbrilliant-01', 'Ġbuild-02', 'Ġscream-01',
                      'Ġanticipate-01', 'Ġout-03', 'Ġration-01', 'Ġcount-02', 'Ġconsistent-01', 'Ġawait-01',
                      'Ġschool-01', 'Ġrent-01', 'Ġarise-02', 'Ġappeal-03', 'Ġhelpful-04', 'Ġsee-03', 'Ġlock-01',
                      'Ġstereotype-01', 'Ġjoin-in-05', 'Ġscrew-up-01', 'Ġwithhold-01', 'Ġmoderate-01', 'Ġaffiliate-01',
                      'Ġwaive-01', 'Ġsuck-01', 'Ġgolf-01', 'Ġturn-out-17', 'Ġput-up-11', 'Ġkeep-up-05', 'Ġstraight-05',
                      'Ġdress-01', 'Ġdig-01', 'Ġplead-02', 'Ġlecture-01', 'Ġgo-09', 'Ġpervert-01', 'Ġcry-01',
                      'Ġmitigate-01', 'Ġsubstitute-01', 'Ġsend-02', 'Ġdown-01', 'Ġwesternize-01', 'Ġcolor-01',
                      'Ġrefer-03', 'Ġpersecute-01', 'Ġscheme-01', 'Ġreactionary-02', 'Ġsubscribe-01', 'Ġshield-01',
                      'Ġexile-01', 'Ġdetonate-01', 'Ġstall-01', 'Ġbroker-01', 'Ġcalculate-01', 'Ġnarrow-02',
                      'Ġstock-01', 'Ġturn-down-05', 'Ġparole-01', 'Ġjoin-04', 'Ġinstitute-01', 'Ġdisprove-01',
                      'Ġpass-20', 'Ġspew-01', 'Ġbid-03', 'Ġwage-01', 'Ġsample-01', 'Ġretail-01', 'Ġratify-01',
                      'Ġspank-01', 'Ġdispatch-01', 'Ġharvest-01', 'Ġrot-01', 'Ġdelude-01', 'Ġclimb-01', 'Ġfrighten-01',
                      'Ġyell-01', 'Ġcoerce-01', 'Ġscary-03', 'Ġstretch-01', 'Ġdestabilize-01', 'Ġblood-02',
                      'Ġconfine-01', 'Ġoutrageous-02', 'Ġbeg-01', 'Ġwield-01', 'Ġscrap-01', 'Ġprivatize-01', 'Ġcure-01',
                      'Ġmature-02', 'Ġcoexist-01', 'Ġassert-02', 'Ġget-along-18', 'Ġreunify-01', 'Ġlook-forward-03',
                      'Ġnumber-01', 'Ġtrash-01', 'Ġrun-04', 'Ġgive-up-08', 'Ġbright-02', 'Ġout-01', 'Ġheal-01',
                      'Ġmassacre-01', 'Ġtackle-01', 'Ġstake-01', 'Ġopen-09', 'Ġknow-04', 'Ġcorrespond-02',
                      'Ġdisregard-01', 'Ġalienate-01', 'Ġinsure-01', 'Ġdisapprove-01', 'Ġdrain-01', 'Ġdeflect-01',
                      'Ġexit-01', 'Ġvacation-01', 'Ġcook-01', 'Ġadapt-01', 'Ġdissolve-01', 'Ġlift-01', 'Ġclose-down-04',
                      'Ġcome-down-23', 'Ġbully-01', 'Ġdenounce-01', 'Ġstab-01', 'Ġexpel-01', 'Ġabstain-01',
                      'Ġcut-out-06', 'Ġswallow-01', 'Ġcome-in-07', 'Ġstep-in-02', 'Ġseek-out-02', 'Ġpace-01', 'Ġwed-01',
                      'Ġgo-on-25', 'Ġsave-03', 'Ġcome-up-13', 'Ġsort-out-02', 'Ġtattoo-01', 'Ġleave-out-03', 'Ġkiss-01',
                      'Ġchance-01', 'Ġprolong-01', 'Ġtroll-01', 'Ġconcentrate-01', 'Ġchannel-01', 'Ġrecreation-02',
                      'Ġcenter-01', 'Ġweaponize-01', 'Ġexplicit-03', 'Ġdraft-02', 'Ġpose-02', 'Ġcrush-01',
                      'Ġdiscredit-01', 'Ġfurther-01', 'Ġdedicate-01', 'Ġsit-down-02', 'Ġleave-10', 'Ġforge-02',
                      'Ġcensor-01', 'Ġparade-02', 'Ġpaint-02', 'Ġcatch-03', 'Ġremortgage-01', 'Ġslow-down-03',
                      'Ġadmit-02', 'Ġbreak-19', 'Ġcounterfeit-01', 'Ġrun-10', 'Ġupgrade-01', 'Ġdeduct-01',
                      'Ġconfess-01', 'Ġdecline-02', 'Ġbar-01', 'Ġbrief-01', 'Ġconduct-02', 'Ġlynch-01', 'Ġacquit-01',
                      'Ġhyperlink-01', 'Ġlight-04', 'Ġconcrete-02', 'Ġreach-02', 'Ġmarch-01', 'Ġpurport-01',
                      'Ġcall-on-05', 'Ġpaddle-01', 'Ġfilter-02', 'Ġstrip-01', 'Ġcompose-01', 'Ġerupt-01', 'Ġwipe-01',
                      'Ġtrace-02', 'Ġdespise-01', 'Ġminimize-01', 'Ġneglect-01', 'Ġloyal-01', 'Ġslip-01', 'Ġrevive-01',
                      'Ġwork-07', 'Ġbeat-up-05', 'Ġdetermined-02', 'Ġpass-07', 'Ġprescribe-02', 'Ġfuss-01',
                      'Ġdemolish-01', 'Ġavail-01', 'Ġput-in-05', 'Ġlease-01', 'Ġembrace-01', 'Ġmerit-01',
                      'Ġintensify-01', 'Ġhearing-02', 'Ġweaken-01', 'Ġcolonize-01', 'Ġoffset-01', 'Ġgather-01',
                      'Ġtake-off-07', 'Ġbright-03', 'Ġextend-02', 'Ġget-30', 'Ġpreexist-01', 'Ġsnow-01', 'Ġstrike-02',
                      'Ġgross-06', 'Ġdiminish-01', 'Ġprejudice-01', 'Ġrage-02', 'Ġnotify-01', 'Ġcontest-02', 'Ġhype-01',
                      'Ġrevisit-01', 'Ġdark-02', 'Ġstand-08', 'Ġcertify-01', 'Ġoversee-01', 'Ġname-02', 'Ġlock-up-03',
                      'Ġknow-03', 'Ġminimal-02', 'Ġtell-02', 'Ġrotate-01', 'Ġoperate-02', 'Ġfat-03', 'Ġindulge-01',
                      'Ġfeel-06', 'Ġset-08', 'Ġsurpass-01', 'Ġpull-06', 'Ġget-06', 'Ġcamp-02', 'Ġgut-01', 'Ġchair-01',
                      'Ġqualify-01', 'Ġspare-01', 'Ġblunt-02', 'Ġproceed-01', 'Ġdump-01', 'Ġreckon-01', 'Ġpierce-01',
                      'Ġmelt-01', 'Ġfeel-05', 'Ġstand-03', 'Ġelaborate-01', 'Ġreach-03', 'Ġspark-01', 'Ġcoincide-01',
                      'Ġslander-01', 'Ġjoin-up-02', 'Ġshame-01', 'Ġboard-01', 'Ġrule-out-02', 'Ġblockade-01',
                      'Ġincinerate-01', 'Ġderive-01', 'Ġget-by-17', 'Ġcharacterize-01', 'Ġstockpile-01', 'Ġpersuade-01',
                      'Ġdecapitate-01', 'Ġrun-08', 'Ġpack-01', 'Ġbust-01', 'Ġpolice-01', 'Ġtrick-01', 'Ġblast-05',
                      'Ġtreat-04', 'Ġrun-off-24', 'Ġapprentice-01', 'Ġdispose-01', 'Ġinhibit-01', 'Ġwire-01', 'Ġtop-01',
                      'Ġhand-over-02', 'Ġknow-06', 'Ġabet-01', 'Ġcatch-up-04', 'Ġsleep-02', 'Ġslam-02', 'Ġbreed-01',
                      'Ġcontend-02', 'Ġperjure-01', 'Ġmanipulate-01', 'Ġprobe-01', 'Ġtrend-01', 'Ġtighten-01',
                      'Ġboycott-01', 'Ġtable-01', 'Ġindoctrinate-01', 'Ġsafeguard-01', 'Ġevacuate-01', 'Ġinterdict-01',
                      'Ġpetition-01', 'Ġformulate-01', 'Ġpartake-01', 'Ġpass-04', 'Ġoverride-01', 'Ġemit-01',
                      'Ġcharacteristic-02', 'Ġtimely-03', 'Ġstun-01', 'Ġcrumble-01', 'Ġmaximize-01', 'Ġpass-away-16',
                      'Ġrun-07', 'Ġsmile-01', 'Ġinquire-01', 'Ġlag-01', 'Ġlive-up-04', 'Ġdistance-01', 'Ġcold-02',
                      'Ġdeep-03', 'Ġrelax-01', 'Ġill-02', 'Ġsignify-01', 'Ġhold-back-07', 'Ġtransplant-01', 'Ġsmoke-01',
                      'Ġcurb-01', 'Ġdelegate-01', 'Ġseal-01', 'Ġlure-01', 'Ġintimate-02', 'Ġfresh-04', 'Ġseat-01',
                      'Ġmove-03', 'Ġkeep-03', 'Ġoutweigh-01', 'Ġrevere-01', 'Ġclone-01', 'Ġenlist-01', 'Ġclick-01',
                      'Ġempty-02', 'Ġfire-04', 'Ġcontend-01', 'Ġabide-01', 'Ġcraft-01', 'Ġtip-05', 'Ġwrap-01',
                      'Ġbite-01', 'Ġtoss-01', 'Ġpolite-01', 'Ġdesirable-02', 'Ġdefuse-01', 'Ġthrill-01', 'Ġproduce-02',
                      'Ġoblige-02', 'Ġdate-02', 'Ġalternate-01', 'Ġget-on-21', 'Ġramble-02', 'Ġhurt-02', 'Ġdistant-02',
                      'Ġhot-05', 'Ġpale-03', 'Ġproclaim-01', 'Ġclass-01', 'Ġcome-across-21', 'Ġsneak-01', 'Ġerode-01',
                      'Ġchampion-01', 'Ġneutral-02', 'Ġalien-01', 'Ġgrieve-01', 'Ġswear-01', 'Ġgo-21',
                      'Ġunderestimate-01', 'Ġaddictive-02', 'Ġpropagate-01', 'Ġlast-04', 'Ġcommence-01', 'Ġair-01',
                      'Ġmark-02', 'Ġaccommodate-01', 'Ġdemonize-01', 'Ġmock-01', 'Ġnuke-01', 'Ġswell-01', 'Ġbrag-01',
                      'Ġassert-03', 'Ġdisrespect-01', 'Ġwork-12', 'Ġremarkable-02', 'Ġpool-01', 'Ġpaint-03', 'Ġpour-01',
                      'Ġdecommission-01', 'Ġamplify-01', 'Ġmad-02', 'Ġcorrelate-01', 'Ġautomate-01', 'Ġmoney-01',
                      'Ġcontent-02', 'Ġstorm-01', 'Ġthrive-01', 'Ġliable-01', 'Ġhopeful-02', 'Ġexpire-01', 'Ġwork-06',
                      'Ġdisperse-01', 'Ġlay-04', 'Ġfall-apart-09', 'Ġterror-02', 'Ġphilander-01', 'Ġscrutinize-01',
                      'Ġfathom-01', 'Ġmake-up-08', 'Ġhumiliate-01', 'Ġcharge-06', 'Ġnatural-02', 'Ġfollow-up-03',
                      'Ġbend-01', 'Ġgrade-01', 'Ġenter-02', 'Ġpend-01', 'Ġprey-01', 'Ġmediate-01', 'Ġconclude-02',
                      'Ġmask-01', 'Ġreactivate-01', 'Ġevolve-02', 'Ġrestart-01', 'Ġencrypt-01', 'Ġget-through-12',
                      'Ġgrow-02', 'Ġbestow-01', 'Ġput-out-10', 'Ġdisplace-01', 'Ġcount-03', 'Ġstabilize-01',
                      'Ġembezzle-01', 'Ġpass-on-09', 'Ġform-02', 'Ġroot-02', 'Ġtrample-01', 'Ġmake-out-23',
                      'Ġfit-in-02', 'Ġhospitalize-01', 'Ġcut-down-11', 'Ġconstrain-01', 'Ġclash-01', 'Ġconsolidate-01',
                      'Ġmeddle-01', 'Ġreproduce-01', 'Ġclever-01', 'Ġdiversify-01', 'Ġpostpone-01', 'Ġstructure-01',
                      'Ġnarrow-01', 'Ġincur-01', 'Ġdraw-up-03', 'Ġdrive-04', 'Ġpin-01', 'Ġdelight-01', 'Ġput-on-08',
                      'Ġcoverage-06', 'Ġbring-about-05', 'Ġstir-up-04', 'Ġlet-down-04', 'Ġsigh-02', 'Ġspace-01',
                      'Ġcheat-02', 'Ġlessen-01', 'Ġrender-02', 'Ġrender-01', 'Ġmenace-01', 'Ġprevail-01', 'Ġreclaim-01',
                      'Ġpuzzle-01', 'Ġhesitate-01', 'Ġgo-23', 'Ġcharm-01', 'Ġturn-over-12', 'Ġwander-01',
                      'Ġrenovate-01', 'Ġpackage-01', 'Ġheadquarter-01', 'Ġline-01', 'Ġstraight-06', 'Ġpark-01',
                      'Ġturn-on-13', 'Ġarbitrary-02', 'Ġconceive-01', 'Ġexert-01', 'Ġspell-01', 'Ġdye-01', 'Ġtune-01',
                      'Ġrip-01', 'Ġgarner-01', 'Ġsick-04', 'Ġshove-01', 'Ġwave-01', 'Ġrust-01', 'Ġkneel-01',
                      'Ġcelebrate-01', 'Ġmisrepresent-01', 'Ġincarcerate-01', 'Ġawake-03', 'Ġup-01', 'Ġslip-02',
                      'Ġconcentrate-02', 'Ġround-05', 'Ġloose-04', 'Ġcripple-01', 'Ġpart-01', 'Ġhoard-01', 'Ġchain-01',
                      'Ġtricky-02', 'Ġhook-up-02', 'Ġtype-01', 'Ġglance-01', 'Ġprize-01', 'Ġtransmit-01', 'Ġhold-03',
                      'Ġsurge-01', 'Ġheadline-01', 'Ġvote-02', 'Ġdraw-01', 'Ġtext-01', 'Ġshower-01', 'Ġcalm-down-02',
                      'Ġfeed-up-03', 'Ġslide-01', 'Ġgo-down-27', 'Ġforward-01', 'Ġproject-02', 'Ġempower-01',
                      'Ġmind-04', 'Ġpass-02', 'Ġneutralize-01', 'Ġrepress-01', 'Ġserve-04', 'Ġeye-01',
                      'Ġdiscriminate-01', 'Ġoverlook-01', 'Ġtop-02', 'Ġmobilize-01', 'Ġstart-out-05', 'Ġpunishable-02',
                      'Ġunderlie-01', 'Ġpenetrate-01', 'Ġgrind-01', 'Ġjump-01', 'Ġpertain-01', 'Ġincline-01',
                      'Ġhumble-01', 'Ġmoderate-02', 'Ġmeaningful-05', 'Ġmislead-01', 'Ġfinish-07', 'Ġdisgruntle-01',
                      'Ġturn-up-15', 'Ġknock-01', 'Ġtake-03', 'Ġlunch-01', 'Ġadd-03', 'Ġcommend-01', 'Ġpatient-01',
                      'Ġattain-01', 'Ġhike-02', 'Ġlurk-01', 'Ġbe-02', 'Ġblackmail-01', 'Ġdubious-02', 'Ġentrench-01',
                      'Ġget-off-23', 'Ġflame-01', 'Ġstand-02', 'Ġsurvive-02', 'Ġafford-02', 'Ġlive-02', 'Ġmoan-01',
                      'Ġportion-01', 'Ġslash-02', 'Ġbreak-through-22', 'Ġplague-01', 'Ġblunt-01', 'Ġabominable-02',
                      'Ġhonorable-03', 'Ġrelated-04', 'Ġdeprive-01', 'Ġdecay-01', 'Ġdistress-01', 'Ġredistribute-01',
                      'Ġforeclose-01', 'Ġwarm-06', 'Ġjealous-02', 'Ġcohere-01', 'Ġpaste-01', 'Ġprompt-01',
                      'Ġcurtail-01', 'Ġtrack-down-02', 'Ġpity-01', 'Ġticket-02', 'Ġtransition-01', 'Ġburst-02',
                      'Ġbroke-23', 'Ġrewrite-01', 'Ġdeliberate-01', 'Ġdisclose-01', 'Ġsituate-01', 'Ġreiterate-01',
                      'Ġprofess-01', 'Ġbabble-01', 'Ġlift-02', 'Ġdeclassify-01', 'Ġremand-01', 'Ġreconcile-01',
                      'Ġassemble-01', 'Ġextort-01', 'Ġcorroborate-01', 'Ġsnip-01', 'Ġnormalize-01', 'Ġclose-03',
                      'Ġremit-01', 'Ġsweep-06', 'Ġbreach-01', 'Ġbehead-01', 'Ġsimulate-01', 'Ġastonish-01',
                      'Ġdeviate-01', 'Ġsmear-02', 'Ġgive-away-02', 'Ġdifferentiate-01', 'Ġintersect-01', 'Ġrectify-01',
                      'Ġlose-out-06', 'Ġtelephone-01', 'Ġrevolutionary-04', 'Ġblow-14', 'Ġexaggerate-01', 'Ġsoar-01',
                      'Ġcontent-01', 'Ġpreside-01', 'Ġcheck-07', 'Ġrefrain-01', 'Ġcrack-02', 'Ġdisintegrate-01',
                      'Ġexterminate-01', 'Ġridicule-01', 'Ġobey-01', 'Ġbundle-01', 'Ġcompound-01', 'Ġwine-01',
                      'Ġdine-01', 'Ġresent-01', 'Ġjeopardize-01', 'Ġusher-in-01', 'Ġcrowd-01', 'Ġelevate-01',
                      'Ġtear-down-05', 'Ġresolve-02', 'Ġearnest-01', 'Ġirritate-01', 'Ġgreen-02', 'Ġheed-01',
                      'Ġplay-10', 'Ġspread-out-04', 'Ġcruise-01', 'Ġcater-01', 'Ġstay-on-02', 'Ġstick-around-03',
                      'Ġcall-13', 'Ġbicker-01', 'Ġcurse-02', 'Ġopen-07', 'Ġrun-up-19', 'Ġtrump-01', 'Ġhappy-02',
                      'Ġredeem-01', 'Ġstrike-04', 'Ġbring-on-06', 'Ġenlighten-01', 'Ġgray-02', 'Ġnote-02', 'Ġshred-01',
                      'Ġgas-03', 'Ġlevy-01', 'Ġturn-18', 'Ġlevel-04', 'Ġbow-01', 'Ġturn-14', 'Ġrehabilitate-01',
                      'Ġcouple-01', 'Ġdent-01', 'Ġcautious-02', 'Ġbust-02', 'Ġshut-01', 'Ġflip-01', 'Ġvalidate-01',
                      'Ġkill-03', 'Ġhot-04', 'Ġchat-01', 'Ġcurious-02', 'Ġlump-01', 'Ġexacerbate-01', 'Ġsneaky-03',
                      'Ġconviction-02', 'Ġproceeding-02', 'Ġreorganize-01', 'Ġfit-05', 'Ġsee-05', 'Ġacquaint-01',
                      'Ġvile-02', 'Ġzap-01', 'Ġuniform-01', 'Ġreplicate-01', 'Ġintent-02', 'Ġgrip-01', 'Ġswear-02',
                      'Ġdecry-01', 'Ġsegregate-01', 'Ġspur-01', 'Ġstorm-02', 'Ġcap-02', 'Ġslant-01', 'Ġspan-01',
                      'Ġcut-back-05', 'Ġfledge-01', 'Ġfoster-01', 'Ġgripe-01', 'Ġquest-01', 'Ġpunch-01',
                      'Ġderegulate-01', 'Ġloathe-01', 'Ġimitate-01', 'Ġhang-out-06', 'Ġbaffle-01', 'Ġsuck-up-04',
                      'Ġtempt-01', 'Ġcondone-01', 'Ġassemble-02', 'Ġoust-01', 'Ġvent-01', 'Ġspout-01', 'Ġsound-02',
                      'Ġevade-01', 'Ġendure-01', 'Ġinvoke-01', 'Ġdevalue-01', 'Ġpose-01', 'Ġbear-06', 'Ġhypothesize-01',
                      'Ġspot-01', 'Ġdiscount-02', 'Ġrail-01', 'Ġhaul-01', 'Ġgauge-01', 'Ġcopyright-01', 'Ġgive-in-09',
                      'Ġimpede-01', 'Ġblast-01', 'Ġtrue-02', 'Ġbeware-01', 'Ġrestore-02', 'Ġnegative-05', 'Ġsteady-01',
                      'Ġfluctuate-01', 'Ġdate-01', 'Ġbathe-01', 'Ġgo-22', 'Ġrestructure-01', 'Ġpile-01', 'Ġspin-01',
                      'Ġtake-down-22', 'Ġbake-01', 'Ġtriple-01', 'Ġdowngrade-02', 'Ġordain-01', 'Ġmultiply-01',
                      'Ġskip-01', 'Ġincorporate-02', 'Ġsettle-01', 'Ġpass-on-14', 'Ġcreepy-04', 'Ġstuff-01',
                      'Ġline-up-02', 'Ġimmune-02', 'Ġlust-01', 'Ġnotable-04', 'Ġbuy-into-04', 'Ġimpair-01',
                      'Ġfigure-04', 'Ġpiss-01', 'Ġgive-back-03', 'Ġboast-01', 'Ġlay-off-06', 'Ġdive-01', 'Ġcommute-02',
                      'Ġracket-02', 'Ġdip-01', 'Ġrotate-02', 'Ġdemagogue-01', 'Ġchange-02', 'Ġbarter-01', 'Ġalike-05',
                      'Ġbind-03', 'Ġwhip-up-03', 'Ġmanifest-01', 'Ġcheck-03', 'Ġyield-01', 'Ġslay-01', 'Ġtally-01',
                      'Ġget-through-13', 'Ġbreak-through-26', 'Ġrelinquish-01', 'Ġreopen-01', 'Ġdefame-01',
                      'Ġinterrupt-01', 'Ġcast-03', 'Ġpattern-01', 'Ġdose-01', 'Ġreenter-01', 'Ġmotivate-02',
                      'Ġstandardize-01', 'Ġdate-entity', 'Ġgovernment-organization', 'Ġtemporal-quantity',
                      'Ġamr-unknown', 'Ġmulti-sentence', 'Ġpolitical-party', 'Ġ:compared-to', 'Ġmonetary-quantity',
                      'Ġordinal-entity', 'Ġreligious-group', 'Ġpercentage-entity', 'Ġworld-region', 'Ġ:consist',
                      'Ġurl-entity', 'Ġpolitical-movement', 'Ġet-cetera', 'Ġat-least', 'Ġmass-quantity',
                      'Ġhave-org-role-91', 'Ġhave-rel-role-91', 'Ġinclude-91', 'Ġhave-concession-91',
                      'Ġhave-condition-91', 'Ġbe-located-at-91', 'Ġrate-entity-91', 'Ġinstead-of-91', 'Ġhyperlink-91',
                      'Ġrequest-confirmation-91', 'Ġhave-purpose-91', 'Ġbe-temporally-at-91', 'Ġregardless-91',
                      'Ġhave-polarity-91', 'Ġbyline-91', 'Ġhave-manner-91', 'Ġhave-part-91', 'Ġhave-quant-91',
                      'Ġpublication-91', 'Ġbe-from-91', 'Ġhave-mod-91', 'Ġhave-frequency-91', 'Ġscore-on-scale-91',
                      'Ġhave-li-91', 'Ġbe-compared-to-91', 'Ġbe-destined-for-91', 'Ġcourse-91', 'Ġhave-subevent-91',
                      'Ġstreet-address-91', 'Ġhave-extent-91', 'Ġstatistical-test-91', 'Ġhave-instrument-91',
                      'Ġhave-name-91', 'Ġbe-polite-91', '-00', '-01', '-02', '-03', '-04', '-05', '-06', '-07', '-08',
                      '-09', '-10', '-11', '-12', '-13', '-14', '-15', '-16', '-17', '-18', '-19', '-20', '-21', '-22',
                      '-23', '-24', '-25', '-26', '-27', '-28', '-29', '-20', '-31', '-32', '-33', '-34', '-35', '-36',
                      '-37', '-38', '-39', '-40', '-41', '-42', '-43', '-44', '-45', '-46', '-47', '-48', '-49', '-50',
                      '-51', '-52', '-53', '-54', '-55', '-56', '-57', '-58', '-59', '-60', '-61', '-62', '-63', '-64',
                      '-65', '-66', '-67', '-68', '-69', '-70', '-71', '-72', '-73', '-74', '-75', '-76', '-77', '-78',
                      '-79', '-80', '-81', '-82', '-83', '-84', '-85', '-86', '-87', '-88', '-89', '-90', '-91', '-92',
                      '-93', '-94', '-95', '-96', '-97', '-98', '-of', 'Ġ:op1', 'Ġ:op2', 'Ġ:op3', 'Ġ:op4', 'Ġ:op5',
                      'Ġ:ARG0', 'Ġ:ARG1', 'Ġ:ARG2', 'Ġ:ARG3', 'Ġ:ARG4', 'Ġ:ARG5', 'Ġ:ARG6', 'Ġ:ARG7', 'Ġ:ARG8',
                      'Ġ:ARG9', 'Ġ:ARG10', 'Ġ:ARG11', 'Ġ:ARG12', 'Ġ:ARG13', 'Ġ:ARG14', 'Ġ:ARG15', 'Ġ:ARG16', 'Ġ:ARG17',
                      'Ġ:ARG18', 'Ġ:ARG19', 'Ġ:ARG20', 'Ġ:accompanier', 'Ġ:age', 'Ġ:beneficiary', 'Ġ:calendar',
                      'Ġ:cause', 'Ġ:century', 'Ġ:concession', 'Ġ:condition', 'Ġ:conj-as-if', 'Ġ:consist-of', 'Ġ:cost',
                      'Ġ:day', 'Ġ:dayperiod', 'Ġ:decade', 'Ġ:degree', 'Ġ:destination', 'Ġ:direction', 'Ġ:domain',
                      'Ġ:duration', 'Ġ:employed-by', 'Ġ:era', 'Ġ:example', 'Ġ:extent', 'Ġ:frequency', 'Ġ:instrument',
                      'Ġ:li', 'Ġ:location', 'Ġ:manner', 'Ġ:meaning', 'Ġ:medium', 'Ġ:mod', 'Ġ:mode', 'Ġ:month', 'Ġ:name',
                      'Ġ:ord', 'Ġ:part', 'Ġ:path', 'Ġ:polarity', 'Ġ:polite', 'Ġ:poss', 'Ġ:purpose', 'Ġ:quant',
                      'Ġ:quarter', 'Ġ:range', 'Ġ:relation', 'Ġ:role', 'Ġ:scale', 'Ġ:season', 'Ġ:source', 'Ġ:subevent',
                      'Ġ:subset', 'Ġ:superset', 'Ġ:time', 'Ġ:timezone', 'Ġ:topic', 'Ġ:unit', 'Ġ:value', 'Ġ:weekday',
                      'Ġ:wiki', 'Ġ:year', 'Ġ:year2', 'Ġ:snt0', 'Ġ:snt1', 'Ġ:snt2', 'Ġ:snt3', 'Ġ:snt4', 'Ġ:snt5',
                      'ĠCOUNTRY', 'ĠQUANTITY', 'ĠORGANIZATION', 'ĠDATE_ATTRS', 'ĠNATIONALITY', 'ĠLOCATION', 'ĠENTITY',
                      'ĠMISC', 'ĠORDINAL_ENTITY', 'ĠIDEOLOGY', 'ĠRELIGION', 'ĠSTATE_OR_PROVINCE', 'ĠCAUSE_OF_DEATH',
                      'ĠTITLE', 'ĠDATE', 'ĠNUMBER', 'ĠHANDLE', 'ĠSCORE_ENTITY', 'ĠDURATION', 'ĠORDINAL', 'ĠMONEY',
                      'ĠCRIMINAL_CHARGE', '_1', '_2', '_3', '_4', '_2', '_5', '_6', '_7', '_8', '_9', '_10', '_11',
                      '_12', '_13', '_14', '_15', 'Ġ<pointer:0>', 'Ġ<pointer:1>', 'Ġ<pointer:2>', 'Ġ<pointer:3>',
                      'Ġ<pointer:4>', 'Ġ<pointer:5>', 'Ġ<pointer:6>', 'Ġ<pointer:7>', 'Ġ<pointer:8>', 'Ġ<pointer:9>',
                      'Ġ<pointer:10>', 'Ġ<pointer:11>', 'Ġ<pointer:12>', 'Ġ<pointer:13>', 'Ġ<pointer:14>',
                      'Ġ<pointer:15>', 'Ġ<pointer:16>', 'Ġ<pointer:17>', 'Ġ<pointer:18>', 'Ġ<pointer:19>',
                      'Ġ<pointer:20>', 'Ġ<pointer:21>', 'Ġ<pointer:22>', 'Ġ<pointer:23>', 'Ġ<pointer:24>',
                      'Ġ<pointer:25>', 'Ġ<pointer:26>', 'Ġ<pointer:27>', 'Ġ<pointer:28>', 'Ġ<pointer:29>',
                      'Ġ<pointer:30>', 'Ġ<pointer:31>', 'Ġ<pointer:32>', 'Ġ<pointer:33>', 'Ġ<pointer:34>',
                      'Ġ<pointer:35>', 'Ġ<pointer:36>', 'Ġ<pointer:37>', 'Ġ<pointer:38>', 'Ġ<pointer:39>',
                      'Ġ<pointer:40>', 'Ġ<pointer:41>', 'Ġ<pointer:42>', 'Ġ<pointer:43>', 'Ġ<pointer:44>',
                      'Ġ<pointer:45>', 'Ġ<pointer:46>', 'Ġ<pointer:47>', 'Ġ<pointer:48>', 'Ġ<pointer:49>',
                      'Ġ<pointer:50>', 'Ġ<pointer:51>', 'Ġ<pointer:52>', 'Ġ<pointer:53>', 'Ġ<pointer:54>',
                      'Ġ<pointer:55>', 'Ġ<pointer:56>', 'Ġ<pointer:57>', 'Ġ<pointer:58>', 'Ġ<pointer:59>',
                      'Ġ<pointer:60>', 'Ġ<pointer:61>', 'Ġ<pointer:62>', 'Ġ<pointer:63>', 'Ġ<pointer:64>',
                      'Ġ<pointer:65>', 'Ġ<pointer:66>', 'Ġ<pointer:67>', 'Ġ<pointer:68>', 'Ġ<pointer:69>',
                      'Ġ<pointer:70>', 'Ġ<pointer:71>', 'Ġ<pointer:72>', 'Ġ<pointer:73>', 'Ġ<pointer:74>',
                      'Ġ<pointer:75>', 'Ġ<pointer:76>', 'Ġ<pointer:77>', 'Ġ<pointer:78>', 'Ġ<pointer:79>',
                      'Ġ<pointer:80>', 'Ġ<pointer:81>', 'Ġ<pointer:82>', 'Ġ<pointer:83>', 'Ġ<pointer:84>',
                      'Ġ<pointer:85>', 'Ġ<pointer:86>', 'Ġ<pointer:87>', 'Ġ<pointer:88>', 'Ġ<pointer:89>',
                      'Ġ<pointer:90>', 'Ġ<pointer:91>', 'Ġ<pointer:92>', 'Ġ<pointer:93>', 'Ġ<pointer:94>',
                      'Ġ<pointer:95>', 'Ġ<pointer:96>', 'Ġ<pointer:97>', 'Ġ<pointer:98>', 'Ġ<pointer:99>',
                      'Ġ<pointer:100>', 'Ġ<pointer:101>', 'Ġ<pointer:102>', 'Ġ<pointer:103>', 'Ġ<pointer:104>',
                      'Ġ<pointer:105>', 'Ġ<pointer:106>', 'Ġ<pointer:107>', 'Ġ<pointer:108>', 'Ġ<pointer:109>',
                      'Ġ<pointer:110>', 'Ġ<pointer:111>', 'Ġ<pointer:112>', 'Ġ<pointer:113>', 'Ġ<pointer:114>',
                      'Ġ<pointer:115>', 'Ġ<pointer:116>', 'Ġ<pointer:117>', 'Ġ<pointer:118>', 'Ġ<pointer:119>',
                      'Ġ<pointer:120>', 'Ġ<pointer:121>', 'Ġ<pointer:122>', 'Ġ<pointer:123>', 'Ġ<pointer:124>',
                      'Ġ<pointer:125>', 'Ġ<pointer:126>', 'Ġ<pointer:127>', 'Ġ<pointer:128>', 'Ġ<pointer:129>',
                      'Ġ<pointer:130>', 'Ġ<pointer:131>', 'Ġ<pointer:132>', 'Ġ<pointer:133>', 'Ġ<pointer:134>',
                      'Ġ<pointer:135>', 'Ġ<pointer:136>', 'Ġ<pointer:137>', 'Ġ<pointer:138>', 'Ġ<pointer:139>',
                      'Ġ<pointer:140>', 'Ġ<pointer:141>', 'Ġ<pointer:142>', 'Ġ<pointer:143>', 'Ġ<pointer:144>',
                      'Ġ<pointer:145>', 'Ġ<pointer:146>', 'Ġ<pointer:147>', 'Ġ<pointer:148>', 'Ġ<pointer:149>',
                      'Ġ<pointer:150>', 'Ġ<pointer>', 'Ġ<stop>', 'Ġ<lit>', 'Ġ</lit>', 'Ġ<backr:src:XXX>',
                      'Ġ<backr:trg:XXX>', '<AMR>', '</AMR>']
# The entries of raw_special_tokens with any leading "Ġ" characters removed.
special_tokens = [token.lstrip("Ġ") for token in raw_special_tokens]

# Upper-case category placeholder tokens, each prefixed with "\u0120" (the "Ġ"
# character). The same 22 entries, in the same order, also appear in
# raw_special_tokens above (from 'ĠCOUNTRY' through 'ĠCRIMINAL_CHARGE').
recategorizations = [
    "\u0120COUNTRY",
    "\u0120QUANTITY",
    "\u0120ORGANIZATION",
    "\u0120DATE_ATTRS",
    "\u0120NATIONALITY",
    "\u0120LOCATION",
    "\u0120ENTITY",
    "\u0120MISC",
    "\u0120ORDINAL_ENTITY",
    "\u0120IDEOLOGY",
    "\u0120RELIGION",
    "\u0120STATE_OR_PROVINCE",
    "\u0120CAUSE_OF_DEATH",
    "\u0120TITLE",
    "\u0120DATE",
    "\u0120NUMBER",
    "\u0120HANDLE",
    "\u0120SCORE_ENTITY",
    "\u0120DURATION",
    "\u0120ORDINAL",
    "\u0120MONEY",
    "\u0120CRIMINAL_CHARGE",
]

# special_tokens = ["<AMR>", "</AMR>"]

# Scheduler name -> learning-rate schedule factory.
# NOTE(review): the factories are not defined in the visible part of this file;
# presumably they are imported from `transformers` at the top -- confirm.
arg_to_scheduler = {
    "linear": get_linear_schedule_with_warmup,
    "cosine": get_cosine_schedule_with_warmup,
    "cosine_w_restarts": get_cosine_with_hard_restarts_schedule_with_warmup,
    "polynomial": get_polynomial_decay_schedule_with_warmup,
    "constant": get_constant_schedule_with_warmup,
}
# Alphabetically sorted scheduler names, and the same names rendered as
# "{a, b, ...}" (presumably for command-line help text -- usage not visible here).
arg_to_scheduler_choices = sorted(arg_to_scheduler.keys())
arg_to_scheduler_metavar = "{" + ", ".join(arg_to_scheduler_choices) + "}"

# Names of the ROUGE variants referred to elsewhere in the project.
ROUGE_KEYS = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

# Class name (as a string) -> tokenizer class.
arg_to_tokenizer = {
    "AutoTokenizer": AutoTokenizer,
    "BartTokenizer": BartTokenizer,
    "T5Tokenizer": T5Tokenizer,
}
# Class name (as a string) -> pretrained language model class.
arg_to_plm_model = {
    "AutoModelForSeq2SeqLM": AutoModelForSeq2SeqLM,
    "BartForConditionalGeneration": BartForConditionalGeneration,
    "T5Model": T5Model,
    "T5ForConditionalGeneration": T5ForConditionalGeneration,
}


================================================
FILE: hanlp/components/amr/amrbart/common/penman_interface.py
================================================
# coding:utf-8
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from penman import load as load_, Graph, Triple
from penman import loads as loads_
from penman import encode as encode_
from penman.model import Model
from penman.models.noop import NoOpModel
from penman.models import amr

# Penman semantic models shared by the helpers in this module.
op_model = Model()  # base penman Model with default settings
noop_model = NoOpModel()  # model from penman.models.noop
amr_model = amr.model  # AMR model bundled with penman; used by encode()
DEFAULT = op_model  # model returned by _get_model() when `dereify` is None


def _get_model(dereify):
    """Select the penman model used when parsing graphs.

    ``None`` selects ``DEFAULT``; any truthy value selects ``op_model`` and any
    other falsy value selects ``noop_model``.
    """
    if dereify is None:
        return DEFAULT
    return op_model if dereify else noop_model


def _remove_wiki(graph):
    """Return a copy of ``graph`` whose ``:wiki`` targets are all replaced by "+".

    The result is a fresh ``Graph`` built from the (possibly rewritten) triples
    only; the original graph's ``metadata`` object is then attached to it.
    """
    rewritten = []
    for triple in graph.triples:
        source, role, _target = triple
        if role == ":wiki":
            # Keep the edge itself, but blank out the wiki link value.
            triple = Triple(source, role, "+")
        rewritten.append(triple)

    cleaned = Graph(rewritten)
    cleaned.metadata = graph.metadata
    return cleaned


def load(source, dereify=None, remove_wiki=False):
    """Read graphs from ``source`` via ``penman.load``.

    Args:
        source: Passed through to ``penman.load`` as ``source``.
        dereify: Chooses the penman model (see ``_get_model``).
        remove_wiki: When true, every loaded graph is replaced in place in the
            returned list by its ``_remove_wiki`` version.

    Returns:
        The list of graphs produced by ``penman.load``.
    """
    graphs = load_(source=source, model=_get_model(dereify))
    if remove_wiki:
        for position, graph in enumerate(graphs):
            graphs[position] = _remove_wiki(graph)
    return graphs


def loads(string, dereify=None, remove_wiki=False):
    """Parse graphs from ``string`` via ``penman.loads``.

    Args:
        string: Passed through to ``penman.loads`` as ``string``.
        dereify: Chooses the penman model (see ``_get_model``).
        remove_wiki: When true, every parsed graph is replaced in place in the
            returned list by its ``_remove_wiki`` version.

    Returns:
        The list of graphs produced by ``penman.loads``.
    """
    graphs = loads_(string=string, model=_get_model(dereify))
    if remove_wiki:
        for position, graph in enumerate(graphs):
            graphs[position] = _remove_wiki(graph)
    return graphs


def encode(g, top=None, indent=-1, compact=False):
    """Serialize graph ``g`` with ``penman.encode``, always using ``amr_model``.

    ``top``, ``indent`` and ``compact`` are forwarded unchanged.
    """
    return encode_(g=g, top=top, indent=indent, compact=compact, model=amr_model)


================================================
FILE: hanlp/components/amr/amrbart/common/postprocessing.py
================================================
# coding:utf-8
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import re
import enum
import penman
import networkx as nx
from hanlp.components.amr.amrbart.common.penman_interface import encode
from collections import defaultdict, Counter

# A small fixed graph: variable b1 is an instance of "bark-01" whose :ARG0 is
# d2, an instance of "dog".
# NOTE(review): presumably a fallback result for when decoding produces no
# valid graph -- its use is not visible in this part of the file.
BACKOFF = penman.Graph(
    [
        penman.Triple("d2", ":instance", "dog"),
        penman.Triple("b1", ":instance", "bark-01"),
        penman.Triple("b1", ":ARG0", "d2"),
    ]
)


def token_processing(tok):
    """Normalize a single decoded token.

    Args:
        tok: A token string, or ``None``.

    Returns:
        * ``None`` when ``tok`` is ``None``.
        * An ``int`` when ``tok`` consists of digits that form a valid Python
          integer literal (``"12"`` -> ``12``). Digit strings that are not
          valid literals (e.g. ``"007"``) are returned unchanged.
        * ``tok`` with the missing double quote added when it has a quote on
          exactly one end.
        * ``tok`` unchanged otherwise.
    """
    # Local import keeps this function self-contained.
    import ast

    if tok is None:
        return None
    if tok.isdigit():
        # literal_eval accepts exactly the numeric literals eval() would, but
        # can never execute code, and only the parse failures are swallowed.
        try:
            return ast.literal_eval(tok)
        except (ValueError, SyntaxError):
            return tok
    opens_quoted = tok.startswith('"')
    closes_quoted = tok.endswith('"')
    if opens_quoted and not closes_quoted:
        return tok + '"'
    if closes_quoted and not opens_quoted:
        return '"' + tok
    return tok


def decode_into_node_and_backreferences(subtoken_ids, tokenizer):
    """Turn predicted subtoken ids into graph tokens plus backreferences.

    The function works in stages: map ids to subtoken strings, drop padding,
    merge subtokens into whole tokens, fold ``<lit> ... </lit>`` spans into one
    quoted string, post-process each token and finally strip the leading
    start token(s) and a trailing ``</s>``.

    Args:
        subtoken_ids: Sequence of integer ids. Ids greater than or equal to
            ``len(tokenizer.encoder)`` are treated as pointers: the excess over
            ``len(tokenizer.encoder)`` is the position of an earlier subtoken.
        tokenizer: Object providing ``INIT``, ``decoder``, ``encoder`` and
            ``convert_tokens_to_string``.

    Returns:
        A ``(tokens, backreferences)`` pair of equal-length lists.
        ``backreferences[i]`` is ``-1`` for an ordinary token, otherwise the
        index of the earlier token it points to (``tokens[i]`` is ``None``
        in that case).

    NOTE(review): ``tokens[1]`` and ``tokens[-1]`` near the end raise
    ``IndexError`` for very short outputs -- presumably callers guard against
    that; confirm.
    """
    # rex_arg: an INIT-prefixed subtoken that continues an edge label after ':'.
    # rex_spc: the special markers after which a new token always starts.
    rex_arg = re.compile(f"^{tokenizer.INIT}(op|snt|conj|prep)")
    rex_spc = re.compile(r"<(s|/s|lit|/lit|stop|unk|pad|mask)>")
    
    # subtoken_ids.insert(1,36)           # add "(" id
    # subtoken_ids.insert(-1, 4839)       # add ")" id

    # get strings (ids missing from the decoder map to None)
    subtokens = [tokenizer.decoder.get(t) for t in subtoken_ids]
    # print("subtokens:", subtokens)
    # fix backreferences: pointer target position, or -1 for a regular vocabulary id
    
    subtoken_backreferences = [max(t - len(tokenizer.encoder), -1) for t in subtoken_ids]
    # strip padding
    subtokens, subtoken_backreferences = zip(
        *[
            (s, b)
            for s, b in zip(subtokens, subtoken_backreferences)
            if s != ("<pad>")
        ]
    )

    # subword collapse: merge subtokens into tokens, remembering for every
    # subtoken position which token it ended up in
    tokens = []
    backreferences = []
    subword_to_token_map = {}
    current_token_i = 0
    for subw_i, (subw_backr, subtok) in enumerate(zip(subtoken_backreferences, subtokens)):
        subword_to_token_map[subw_i] = current_token_i

        # if empty you cannot do anything but add a new word
        if not tokens:
            tokens.append(subtok.lstrip(tokenizer.INIT))
            backreferences.append(-1)
            current_token_i += 1

        # a backreference cannot be split: it becomes its own (None) token
        # pointing at the token that holds the referenced subtoken
        elif subw_backr > -1:
            tokens.append(None)
            backreferences.append(subword_to_token_map[subw_backr])
            current_token_i += 1

        # after a special token, always start a new token
        elif isinstance(tokens[-1], str) and rex_spc.match(tokens[-1]):
            tokens.append(subtok.lstrip(tokenizer.INIT))
            backreferences.append(-1)
            current_token_i += 1

        # after a subtoken ':' (which should be followed by the rest of the edge) ignore tokenizer.INIT
        # TODO: this is an ugly patch due to the fact that BART tokenizer splits after ':'
        elif (tokens[-1] == ":") and rex_arg.match(subtok):
            tokens[-1] = tokens[-1] + subtok[1:]

        # leading tokenizer.INIT marks the start of a new token
        elif subtok.startswith(tokenizer.INIT):
            tokens.append(subtok.lstrip(tokenizer.INIT))
            backreferences.append(-1)
            current_token_i += 1

        # very ugly patch for some cases in which tokenizer.INIT is not in the following token to the edge
        elif (
            isinstance(tokens[-1], str)
            and tokens[-1].startswith(":")
            and tokens[-1][-1].isdigit()
            and (subtok != "-of")
        ):
            tokens.append(subtok.lstrip(tokenizer.INIT))
            backreferences.append(-1)
            current_token_i += 1

        # in any other case attach to the previous
        else:
            tokens[-1] = tokens[-1] + subtok

    # strip INIT and fix byte-level: each string token is passed character by
    # character through the tokenizer's convert_tokens_to_string
    tokens = [
        tokenizer.convert_tokens_to_string(list(t)).lstrip() if isinstance(t, str) else t
        for t in tokens
    ]
    # tokens = [t.replace(tokenizer.INIT, '') if isinstance(t, str) else t for t in tokens]

    # unks are substituted with thing
    tokens = [t if t != "<unk>" else "thing" for t in tokens]

    old_tokens = tokens
    old_backreferences = backreferences

    # <lit> Barack Obama </lit> -> "Barack_Obama"
    # Rebuild tokens/backreferences while folding each literal span into one
    # quoted token; token_to_token_map translates old token indices to new
    # ones and `removed` counts how many old tokens have been dropped so far.
    tokens = []
    backreferences = []
    token_to_token_map = {}
    start_search = 0
    removed = 0
    while True:
        try:

            # raises ValueError when no further "<lit>" exists (handled below)
            lit_start = old_tokens.index("<lit>", start_search)
            # copy everything before the literal unchanged
            token_addition = old_tokens[start_search:lit_start]
            for i, t in enumerate(token_addition, start=start_search):
                token_to_token_map[i] = i - removed
            tokens += token_addition

            backreferences_addition = [
                token_to_token_map[b] if b > -1 else -1
                for b in old_backreferences[start_search:lit_start]
            ]
            backreferences += backreferences_addition

            # start looking for the end of the literal two tokens after "<lit>"
            lit_end = min(lit_start + 2, len(old_tokens) - 1)

            while lit_end < len(old_tokens):
                old_tok = old_tokens[lit_end]

                # unterminated literal: an edge label (':' plus at least three
                # more characters) or "<stop>" ends it; that token is kept
                if isinstance(old_tok, str) and (
                    (old_tok.startswith(":") and len(old_tok) > 3) or (old_tok == "<stop>")
                ):
                    res_tok = old_tokens[lit_start + 1 : lit_end]
                    for i in range(lit_start, lit_end):
                        token_to_token_map[i] = len(tokens)

                    # Remove possible wrong None
                    res = old_tokens[lit_start + 1 : lit_end]
                    res = [str(r) for r in res if r is not None]
                    res = '"' + "_".join(res) + '"'

                    removed += len(res_tok)
                    start_search = lit_end
                    tokens += [res, old_tok]
                    backreferences += [-1, -1]
                    break

                # properly closed literal: "</lit>" itself is dropped as well
                elif old_tok == "</lit>":
                    res_tok = old_tokens[lit_start + 1 : lit_end]
                    for i in range(lit_start, lit_end + 1):
                        token_to_token_map[i] = len(tokens)

                    # Remove possible wrong None
                    res = old_tokens[lit_start + 1 : lit_end]
                    res = [str(r) for r in res if r is not None]
                    res = '"' + "_".join(res) + '"'

                    removed += len(res_tok) + 1
                    start_search = lit_end + 1
                    tokens.append(res)
                    backreferences.append(-1)
                    break

                else:
                    lit_end += 1
                    start_search = lit_end

        except ValueError:
            # no more literals: copy the remaining tokens and finish
            token_addition = old_tokens[start_search:]
            for i, t in enumerate(token_addition, start=start_search):
                token_to_token_map[i] = i - removed
            backreferences_addition = [
                token_to_token_map[b] if b > -1 else b for b in old_backreferences[start_search:]
            ]
            tokens += token_addition
            backreferences += backreferences_addition
            break

    tokens = [token_processing(t) for t in tokens]

    # drop the first token, or the first two when the second one is "<s>",
    # and shift the backreference indices accordingly
    shift = 1
    if tokens[1] == "<s>":
        shift = 2

    tokens = tokens[shift:]
    backreferences = [b if b == -1 else b - shift for b in backreferences[shift:]]

    # drop a trailing end-of-sequence marker
    if tokens[-1] == "</s>":
        tokens.pop()
        backreferences.pop()

    return tokens, backreferences


def index_of(element, iterable, default=None, start=None, end=None):
    """Return the first index in ``[start, end)`` whose item matches ``element``.

    Args:
        element: Either a plain value, compared with ``==`` against each item,
            or a callable predicate that is applied to each item.
        iterable: An indexable sequence that supports ``len()``.
        default: Value returned when no item matches.
        start: First index to inspect; ``None`` means ``0``.
        end: Index to stop before; ``None`` means ``len(iterable)``.

    Returns:
        The index of the first matching item, or ``default``.
    """
    if callable(element):
        matches = element
    else:

        def matches(candidate):
            return element == candidate

    lower = 0 if start is None else start
    upper = len(iterable) if end is None else end
    for position in range(lower, upper):
        if matches(iterable[position]):
            return position
    return default


def separate_edges_nodes(edges_nodes_slice, *other):
    """Pick out ``(edge, node)`` position pairs and project them onto ``other``.

    An edge is any string starting with ``":"``. A pair is recorded when an
    edge is immediately followed by a non-edge item; an edge followed by
    another edge is dropped, as is an edge in the last position.

    Args:
        edges_nodes_slice: Sequence mixing edge labels and nodes.
        *other: Sequences indexed with the positions found in
            ``edges_nodes_slice``.

    Returns:
        One ``(edges, nodes)`` tuple per sequence in ``other``, each holding
        that sequence's items at the edge positions and at the node positions.
    """

    def looks_like_edge(token):
        return isinstance(token, str) and token.startswith(":")

    total = len(edges_nodes_slice)
    edge_positions, node_positions = [], []
    cursor = 0
    while cursor < total:
        found = index_of(looks_like_edge, edges_nodes_slice, start=cursor)
        if found is None or found == total - 1:
            break
        if looks_like_edge(edges_nodes_slice[found + 1]):
            # Two edges in a row: discard the first, retry from the second.
            cursor = found + 1
        else:
            edge_positions.append(found)
            node_positions.append(found + 1)
            cursor = found + 2

    return [
        ([sequence[i] for i in edge_positions], [sequence[i] for i in node_positions])
        for sequence in other
    ]


def _split_name_ops(graph):
    """Re-split the ``:opN`` literals of ``name`` nodes on underscores.

    For every variable whose ``:instance`` is ``name``, the targets of its
    ``:op*`` triples are unquoted, ordered by the integer after ``:op``, split
    on ``"_"`` and re-emitted as consecutive ``:op1``, ``:op2``, ... triples
    with quoted targets. The new triples are placed where the lowest-positioned
    original ``:op`` triple was.

    Args:
        graph: A graph exposing ``triples`` and ``metadata``.

    Returns:
        A new ``penman.Graph`` built from the rewritten triples, carrying the
        same ``metadata`` object as ``graph``.
    """
    # Variables whose concept is "name".
    name_variables = {
        source for source, role, target in graph.triples if role == ":instance" and target == "name"
    }

    # Per name variable: (triple position, role, unquoted literal) of each :op edge.
    ops_by_variable = defaultdict(list)
    for position, (source, role, target) in enumerate(graph.triples):
        if source in name_variables and role.startswith(":op"):
            ops_by_variable[source].append((position, role, target.strip('"')))

    rewritten = graph.triples.copy()
    for variable, ops in ops_by_variable.items():
        ordered = sorted(ops, key=lambda op: int(op[1][3:]))
        positions = [position for position, _, _ in ordered]

        # Blank out the old :op triples; one slot is refilled just below.
        for position in positions:
            rewritten[position] = None

        pieces = [piece for _, _, literal in ordered for piece in literal.split("_")]
        rewritten[min(positions)] = [
            penman.Triple(variable, ":op" + str(number), '"' + piece + '"')
            for number, piece in enumerate(pieces, start=1)
        ]

    # Drop the blanked slots and flatten the per-variable lists back in place.
    flattened = []
    for entry in rewritten:
        if entry is None:
            continue
        if isinstance(entry, list):
            flattened.extend(entry)
        else:
            flattened.append(entry)

    graph_ = penman.Graph(flattened)
    graph_.metadata = graph.metadata
    return graph_


def _reconstruct_graph_from_nodes(nodes, backreferences):
    """Rebuild a ``penman.Graph`` from a linearized node/edge sequence.

    ``nodes`` is scanned in segments delimited by ``"<stop>"``. The first item
    of each segment is the source node; the whole segment is then split into
    ``(edge, target)`` pairs by ``separate_edges_nodes``. A ``None`` entry in
    ``nodes`` stands for a re-entrancy and is resolved through the parallel
    ``backreferences`` entry, which is looked up in the position -> variable
    table built so far.

    Args:
        nodes: Sequence of concepts, edge labels (strings starting with
            ``":"``), constants, special tokens and ``None`` placeholders.
        backreferences: Sequence parallel to ``nodes``; for ``None`` nodes it
            holds the position of the node being referred to.

    Returns:
        A ``penman.Graph`` over the de-duplicated triples, in emission order.
    """
    triples = []
    triples_added = set()

    variable2index = {}
    index2variable = {}
    start_index = 0

    # cnt[source variable][edge label] -> how many times that edge was emitted.
    cnt = defaultdict(Counter)

    while start_index < len(nodes):
        stop_index = index_of("<stop>", nodes, default=len(nodes) + 1, start=start_index)
        old_start_index = start_index
        start_index = stop_index + 1

        src_node, src_backr = nodes[old_start_index], backreferences[old_start_index]

        if src_node == "<stop>":
            continue

        trg_nodes_edges = nodes[old_start_index:stop_index]
        trg_nodes_edges_backr = backreferences[old_start_index:stop_index]
        trg_nodes_edges_indices = list(range(old_start_index, stop_index))

        if isinstance(src_node, str):
            if src_node in ("<s>", "</s>", "<stop>"):
                continue
            elif ("/" in src_node) or (":" in src_node) or ("(" in src_node) or (")" in src_node):
                # Characters with a structural meaning in PENMAN notation:
                # replace the whole concept with a placeholder.
                src_node = "thing"

        if src_node is not None:
            src_node = str(src_node)
            src_var = src_node[0].lower()
            # Variables are "<letter><running number>". Fall back to "x" when the
            # concept does not start with an ASCII letter. (The previous
            # `not ... not in` double negation inverted this test, unlike the
            # equivalent check used for target nodes below.)
            if src_var not in "abcdefghijklmnopqrstuvwxyz":
                src_var = "x"
            src_var = f"{src_var}{len(variable2index)}"
            src_var_i = old_start_index
            variable2index[src_var] = src_var_i
            index2variable[src_var_i] = src_var
            triple = penman.Triple(src_var, ":instance", src_node)
            if triple not in triples_added:
                triples.append(triple)
                triples_added.add(triple)
        else:
            # NOTE(review): when the backreference cannot be resolved, src_var
            # keeps its value from the previous segment (and is unbound if this
            # is the very first segment).
            if src_backr in index2variable:
                src_var = index2variable[src_backr]
        # more resilient logic here
        (trg_edges, trg_nodes), (_, trg_nodes_backr), (_, trg_nodes_indices) = separate_edges_nodes(
            trg_nodes_edges, trg_nodes_edges, trg_nodes_edges_backr, trg_nodes_edges_indices
        )

        for n, e, nb, ni in zip(trg_nodes, trg_edges, trg_nodes_backr, trg_nodes_indices):

            # Skip targets that are themselves edge labels or special tokens.
            if isinstance(n, str) and n.startswith(":"):
                continue
            if isinstance(n, str) and n.startswith("<") and n.endswith(">"):
                continue
            # ":li" is the only edge label shorter than four characters that is kept.
            if e == ":li":
                pass
            elif len(e) < 4 or (not e.startswith(":")):
                continue

            # same edge more than once
            num = cnt[src_var][e]
            if num:

                # :op / :snt edges are never repeated; other edges are emitted
                # at most four times per source.
                if e.startswith(":op") or e.startswith(":snt"):
                    continue
                elif num > 3:
                    continue

            if n is None:
                # Re-entrancy: point at the variable created for position nb.
                if nb not in index2variable:
                    continue
                trg_var = index2variable[nb]
                trg = trg_var
            elif e == ":mode":
                trg = n
            elif (
                (not isinstance(n, str))
                or re.match(r"^[+-]?\d+\.?\d*$", n)
                or (n == "-")
                or (n == "+")
            ):
                # Numbers and polarity signs are emitted as bare constants.
                trg = str(n)
            elif n.startswith('"') and n.endswith('"') and len(n) > 2:
                # Quoted literal: strip any inner quotes, then re-quote.
                trg = '"' + n.replace('"', "") + '"'
            elif ("/" in n) or (":" in n) or ("(" in n) or (")" in n) or ("=" in n):
                trg = f'"{n}"'
            elif n == '"':
                continue
            elif (
                (n.startswith('"') and (not n.endswith('"')))
                or (not n.startswith('"') and (n.endswith('"')))
                or ('"' in n)
            ):
                # Unbalanced or stray quotes: normalise to a quoted literal.
                trg = '"' + n.replace('"', "") + '"'
            else:
                # Plain concept: create a fresh variable and its :instance triple.
                trg_var = n[0].lower()
                if trg_var not in "abcdefghijklmnopqrstuvwxyz":
                    trg_var = "x"
                trg_var = f"{trg_var}{len(variable2index)}"
                trg_var_i = ni
                variable2index[trg_var] = trg_var_i
                index2variable[trg_var_i] = trg_var
                triple = penman.Triple(trg_var, ":instance", n)
                if triple not in triples_added:
                    triples.append(triple)
                    triples_added.add(triple)
                trg = trg_var

            triple = penman.Triple(src_var, e, trg)
            if triple not in triples_added:
                triples.append(triple)
                triples_added.add(triple)

            cnt[src_var][e] += 1

    return penman.Graph(triples)


def build_graph(nodes, backreferences, restore_name_ops=False):
    """Build a graph from linearized nodes and their backreferences.

    Args:
        nodes: Linearized node/edge sequence.
        backreferences: Sequence parallel to ``nodes``.
        restore_name_ops: When true, the rebuilt graph is additionally passed
            through ``_split_name_ops``.

    Returns:
        The graph produced by ``_reconstruct_graph_from_nodes``, optionally
        post-processed by ``_split_name_ops``.
    """
    rebuilt = _reconstruct_graph_from_nodes(nodes, backreferences)
    return _split_name_ops(rebuilt) if restore_name_ops else rebuilt


class ParsedStatus(enum.Enum):
    """Status tag returned together with a graph by the post-processing helpers."""

    # Returned by connect_graph_if_not_connected when the graph encodes as-is.
    OK = 0
    # Returned by connect_graph_if_not_connected after it added a connecting node.
    FIXED = 1
    # Not produced by the code visible here; presumably marks a fallback graph - TODO confirm.
    BACKOFF = 2


def connect_graph_if_not_connected(graph):
    """Return an encodable graph, joining disconnected components if needed.

    The graph is first passed to ``encode``. If that succeeds it is returned
    untouched. Otherwise a new ``and`` node is created and linked with
    ``:op1``, ``:op2``, ... to one variable of every connected component (the
    one with the smallest number after its one-character prefix), and the
    result is encoded once more as a sanity check.

    Args:
        graph: A graph exposing ``triples``, ``variables()`` and ``metadata``.

    Returns:
        ``(graph, ParsedStatus.OK)`` when the input already encodes, otherwise
        ``(repaired_graph, ParsedStatus.FIXED)``.

    Raises:
        Whatever ``encode`` raises if the repaired graph still cannot be
        encoded.
    """
    try:
        encode(graph)
        return graph, ParsedStatus.OK
    except Exception:
        # Best-effort: any encoding failure triggers the repair below.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit through.
        pass

    # Undirected view of the variable-to-variable structure. Triples whose
    # target is not a variable add a self-loop so the source is still a node.
    nxgraph = nx.MultiGraph()
    variables = graph.variables()
    for v1, _, v2 in graph.triples:
        if v1 in variables and v2 in variables:
            nxgraph.add_edge(v1, v2)
        elif v1 in variables:
            nxgraph.add_edge(v1, v1)

    triples = graph.triples.copy()
    new_triples = []
    addition = f"a{len(variables) + 1}"
    triples.append(penman.Triple(addition, ":instance", "and"))
    for i, conn_set in enumerate(nx.connected_components(nxgraph), start=1):
        edge = f":op{i}"
        # Variables look like "<letter><number>"; order by the numeric part.
        conn_set = sorted(conn_set, key=lambda x: int(x[1:]))
        conn_set = [c for c in conn_set if c in variables]
        node = conn_set[0]
        new_triples.append(penman.Triple(addition, edge, node))
    # The connecting edges go first, ahead of the original triples.
    triples = new_triples + triples
    metadata = graph.metadata
    graph = penman.Graph(triples)
    graph.metadata.update(metadata)
    # Not caught on purpose: a failure here propagates to the caller.
    encode(graph)

    return graph, ParsedStatus.FIXED


def restore_backreferences_from_pointers(nodes):
    """Turn ``<pointer:N>`` tokens into positional backreferences.

    Pointer tokens are removed from the output. The first time a pointer
    precedes a node, the output position of that node is remembered. Every
    later occurrence of the same pointer inserts a ``None`` placeholder whose
    backreference is that remembered position, followed by the node itself.
    A pointer that is followed directly by another pointer, or by nothing, is
    discarded.

    Args:
        nodes: Sequence of tokens, possibly containing ``<pointer:N>`` strings.

    Returns:
        ``(new_nodes, new_backreferences)``: two parallel lists, where the
        backreference is ``-1`` for every entry that is not a placeholder.
    """
    out_nodes = []
    out_backreferences = []
    first_position = {}
    pending = None

    for token in nodes:
        if isinstance(token, str) and token.startswith("<pointer:") and token.endswith(">"):
            pending = token
            continue

        if pending is not None:
            if pending in first_position:
                # Seen before: emit a placeholder pointing at the first mention.
                out_nodes.append(None)
                out_backreferences.append(first_position[pending])
            else:
                first_position[pending] = len(out_nodes)
            pending = None

        out_nodes.append(token)
        out_backreferences.append(-1)

    return out_nodes, out_backreferences


================================================
FILE: hanlp/components/amr/amrbart/data_interface/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-07 14:36


================================================
FILE: hanlp/components/amr/amrbart/data_interface/dataset.py
================================================
# coding:utf-8
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

class AMRParsingDataSet(object):
    """Tokenization helper for samples whose input is text and whose labels are AMR."""

    @staticmethod
    def tokenize(sample: dict, tokenizer, max_src_length=400, max_tgt_length=1024, unified_input=True, amr="src",
                 text="tgt"):
        """Add ``input_ids`` (and ``labels`` when an AMR string is present) to ``sample``.

        Args:
            sample: Mapping holding the text under key ``text`` and, optionally,
                a whitespace-separated AMR string under key ``amr``. It is
                modified in place.
            tokenizer: Callable returning a mapping with ``"input_ids"``; must
                also provide ``tokenize_amr``, ``amr_bos_token_id``,
                ``amr_eos_token_id`` and ``mask_token_id``.
            max_src_length: Length budget for the AMR labels, including the
                closing AMR end token.
            max_tgt_length: Length budget for the text ids.
            unified_input: When true, the text ids are cut to
                ``max_tgt_length - 3`` and followed by the AMR begin, mask and
                AMR end token ids.
            amr: Key of the AMR string in ``sample``.
            text: Key of the text in ``sample``.

        Returns:
            The same ``sample`` object.
        """
        amr_string = sample.get(amr, None)
        sentence = sample[text]

        if amr_string is not None:
            amr_ids = tokenizer.tokenize_amr(amr_string.split())
            sample['labels'] = amr_ids[:max_src_length - 2] + [tokenizer.amr_eos_token_id]

        encoded = tokenizer(sentence, max_length=max_tgt_length, padding=False, truncation=True)
        input_ids = encoded["input_ids"]
        if unified_input:
            suffix = [tokenizer.amr_bos_token_id, tokenizer.mask_token_id, tokenizer.amr_eos_token_id]
            input_ids = input_ids[:max_tgt_length - 3] + suffix
        sample['input_ids'] = input_ids
        return sample


class AMR2TextDataSet(object):
    """Tokenization helper for samples whose input is AMR and whose labels are text."""

    @staticmethod
    def tokenize(sample: dict, tokenizer, max_src_length=400, max_tgt_length=1024, unified_input=True, amr="src",
                 text="tgt"):
        """Add ``input_ids`` (and ``labels`` when text is present) to ``sample``.

        Args:
            sample: Mapping holding a whitespace-separated AMR string under key
                ``amr`` and, optionally, the text under key ``text``. It is
                modified in place.
            tokenizer: Callable returning a mapping with ``"input_ids"``; must
                also provide ``tokenize_amr``, ``as_target_tokenizer`` and the
                special token ids read below.
            max_src_length: Length budget for the AMR ids including the special
                tokens wrapped around them.
            max_tgt_length: Length budget passed to the tokenizer for the text.
            unified_input: When true, the AMR ids are prefixed with
                ``bos, mask, eos`` before the AMR begin token.
            amr: Key of the AMR string in ``sample``.
            text: Key of the text in ``sample``.

        Returns:
            The same ``sample`` object.
        """
        graph_string = sample[amr]
        sentence = sample.get(text, None)

        if unified_input:
            # Layout: [<s>[mask]</s><AMR>xxx</AMR>]
            prefix = [tokenizer.bos_token_id, tokenizer.mask_token_id, tokenizer.eos_token_id,
                      tokenizer.amr_bos_token_id]
            budget = max_src_length - 5
        else:
            prefix = [tokenizer.amr_bos_token_id]
            budget = max_src_length - 2
        amr_ids = tokenizer.tokenize_amr(graph_string.split())[:budget]
        sample["input_ids"] = prefix + amr_ids + [tokenizer.amr_eos_token_id]

        if sentence is not None:
            with tokenizer.as_target_tokenizer():
                target = tokenizer(sentence, max_length=max_tgt_length, padding=False, truncation=True)
                # Drop the first id of every encoded sequence.
                target["input_ids"] = [ids[1:] for ids in target["input_ids"]]
            sample["labels"] = target["input_ids"]
        return sample


================================================
FILE: hanlp/components/amr/amrbart/model_interface/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-03 20:33


================================================
FILE: hanlp/components/amr/amrbart/model_interface/modeling_bart.py
================================================
# coding=utf-8
# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch BART model."""
import copy
import math
import random
import warnings
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import (
    add_code_sample_docstrings,
    add_end_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from transformers.models.bart.configuration_bart import BartConfig


# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)

# The constants below are named for use in generated docstrings; no use of
# them is visible in this part of the file.
_CHECKPOINT_FOR_DOC = "facebook/bart-base"
_CONFIG_FOR_DOC = "BartConfig"
_TOKENIZER_FOR_DOC = "BartTokenizer"

# Base model docstring
_EXPECTED_OUTPUT_SHAPE = [1, 8, 768]

# SequenceClassification docstring
_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "valhalla/bart-large-sst2"
_SEQ_CLASS_EXPECTED_LOSS = 0.0
_SEQ_CLASS_EXPECTED_OUTPUT = "'POSITIVE'"

# QuestionAnswering docstring
_CHECKPOINT_FOR_QA = "valhalla/bart-large-finetuned-squadv1"
_QA_EXPECTED_LOSS = 0.59
_QA_EXPECTED_OUTPUT = "' nice puppet'"


BART_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/bart-large",
    # see all BART models at https://huggingface.co/models?filter=bart
]


def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Build decoder inputs by shifting every row of `input_ids` one position to the right.

    Column 0 of the result is `decoder_start_token_id`, the last column of `input_ids` is dropped, and any `-100`
    left in the result is replaced by `pad_token_id`. The input tensor is not modified.

    Raises:
        ValueError: if `pad_token_id` is `None`.
    """
    shifted = input_ids.new_zeros(input_ids.shape)
    shifted[:, 0] = decoder_start_token_id
    shifted[:, 1:] = input_ids[:, :-1].clone()

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")

    # -100 marks ignored label positions; it is not a valid token id.
    ignored = shifted == -100
    return shifted.masked_fill(ignored, pad_token_id)


def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
    """
    Make causal mask used for bi-directional self-attention.
    """
    bsz, tgt_len = input_ids_shape
    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
    mask_cond = torch.arange(mask.size(-1))
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)

    if past_key_values_length > 0:
        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)


def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)


class BartLearnedPositionalEmbedding(nn.Embedding):
    """
    Learned absolute position embeddings with a fixed maximum number of positions.

    The underlying table holds `num_embeddings + 2` rows and every position id is shifted by `offset = 2` before the
    lookup.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # Bart offsets the embedding ids by 2 and enlarges the table to match;
        # other models do not have this hack.
        self.offset = 2
        table_size = num_embeddings + self.offset
        super().__init__(table_size, embedding_dim)

    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
        """Return embeddings for positions `past_key_values_length .. + seqlen`; the input is a `[bsz x seqlen]` shape."""
        _, seq_len = input_ids_shape[:2]
        first = past_key_values_length + self.offset
        shifted_positions = torch.arange(
            first, first + seq_len, dtype=torch.long, device=self.weight.device
        )
        return super().forward(shifted_positions)


class BartAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper

    Args:
        embed_dim: Model width; must be divisible by `num_heads`.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention probabilities.
        is_decoder: When true, `forward` returns the key/value states so they can be reused as a cache.
        bias: Whether the four projection layers use a bias term.
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        # Queries are pre-scaled by 1/sqrt(head_dim).
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape `(bsz, seq_len, embed_dim)` into a contiguous `(bsz, num_heads, seq_len, head_dim)` tensor."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel

        Args:
            hidden_states: Query input of shape `(bsz, tgt_len, embed_dim)`.
            key_value_states: When given, keys and values are projected from it (cross-attention).
            past_key_value: Cached `(key_states, value_states)`; reused as-is for cross-attention and
                concatenated with the new states for self-attention.
            attention_mask: Additive mask of shape `(bsz, 1, tgt_len, src_len)`.
            layer_head_mask: Multiplicative per-head mask of shape `(num_heads,)`.
            output_attentions: Whether to return the attention weights.

        Returns:
            `(attn_output, attn_weights, past_key_value)`. `attn_weights` has shape
            `(bsz, num_heads, tgt_len, src_len)` or is `None` when `output_attentions` is false;
            `past_key_value` is the `(key_states, value_states)` pair when `is_decoder` is set, otherwise
            the argument is returned unchanged.

        Raises:
            ValueError: if an intermediate tensor or a provided mask does not have the expected size.
        """

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj
        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states, value_states)

        # Fold the head dimension into the batch dimension for the batched matmuls.
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to be reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            # Report the size that is actually checked (heads folded into the batch dimension).
            raise ValueError(
                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)

        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
        # partitioned across GPUs when using tensor-parallelism.
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value


class BartEncoderLayer(nn.Module):
    """One encoder block: self-attention and a feed-forward network, each followed by dropout, a residual
    connection and layer normalisation."""

    def __init__(self, config: BartConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = BartAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: torch.FloatTensor,
        layer_head_mask: torch.FloatTensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer; it is handed to `BartAttention`, which
                unpacks it as `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to also return the self-attention weights.

        Returns:
            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is true.
        """
        # Self-attention sub-layer.
        attn_out, attn_weights, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        attn_out = nn.functional.dropout(attn_out, p=self.dropout, training=self.training)
        hidden_states = self.self_attn_layer_norm(hidden_states + attn_out)

        # Feed-forward sub-layer.
        ffn_out = self.activation_fn(self.fc1(hidden_states))
        ffn_out = nn.functional.dropout(ffn_out, p=self.activation_dropout, training=self.training)
        ffn_out = self.fc2(ffn_out)
        ffn_out = nn.functional.dropout(ffn_out, p=self.dropout, training=self.training)
        hidden_states = self.final_layer_norm(hidden_states + ffn_out)

        # In half precision, pull inf/nan-affected activations back into the finite range.
        if hidden_states.dtype == torch.float16 and (
            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
        ):
            limit = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-limit, max=limit)

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)


class BartDecoderLayer(nn.Module):
    """One decoder block: self-attention, optional cross-attention over the encoder output and a feed-forward
    network, each followed by dropout, a residual connection and layer normalisation."""

    def __init__(self, config: BartConfig):
        super().__init__()
        self.embed_dim = config.d_model

        self.self_attn = BartAttention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.encoder_attn = BartAttention(
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`; the cross-attention
                block is skipped when it is `None`.
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for the self-attention heads of this layer.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached key/value states; the first two entries belong to
                self-attention, the last two to cross-attention.
            output_attentions (`bool`, *optional*):
                Whether or not to also return the self- and cross-attention weights.
            use_cache (`bool`, *optional*):
                Whether or not to append the present key/value states to the output.

        Returns:
            A tuple starting with the new hidden states, followed by `(self_attn_weights, cross_attn_weights)`
            when `output_attentions` is true and by the present key/value tuple when `use_cache` is true.
        """
        has_cache = past_key_value is not None

        # Self-attention: its cached key/values are the first two cache entries.
        self_attn_cache = past_key_value[:2] if has_cache else None
        attn_out, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_cache,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        attn_out = nn.functional.dropout(attn_out, p=self.dropout, training=self.training)
        hidden_states = self.self_attn_layer_norm(hidden_states + attn_out)

        # Cross-attention: its cached key/values are the last two cache entries.
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            cross_attn_cache = past_key_value[-2:] if has_cache else None
            cross_out, cross_attn_weights, cross_attn_present = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_cache,
                output_attentions=output_attentions,
            )
            cross_out = nn.functional.dropout(cross_out, p=self.dropout, training=self.training)
            hidden_states = self.encoder_attn_layer_norm(hidden_states + cross_out)

            # Present cache layout: self-attention pair first, cross-attention pair after it.
            present_key_value = present_key_value + cross_attn_present

        # Feed-forward network.
        ffn_out = self.activation_fn(self.fc1(hidden_states))
        ffn_out = nn.functional.dropout(ffn_out, p=self.activation_dropout, training=self.training)
        ffn_out = self.fc2(ffn_out)
        ffn_out = nn.functional.dropout(ffn_out, p=self.dropout, training=self.training)
        hidden_states = self.final_layer_norm(hidden_states + ffn_out)

        outputs = (hidden_states,)
        if output_attentions:
            outputs = outputs + (self_attn_weights, cross_attn_weights)
        if use_cache:
            outputs = outputs + (present_key_value,)
        return outputs


class BartClassificationHead(nn.Module):
    """Classification head for sentence-level tasks: dropout -> dense -> tanh -> dropout -> output projection."""

    def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
        """
        Args:
            input_dim: size of the incoming feature vector.
            inner_dim: width of the intermediate dense layer.
            num_classes: number of output logits.
            pooler_dropout: dropout probability, applied both before and after the dense layer.
        """
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.dropout = nn.Dropout(p=pooler_dropout)
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Map features whose last dimension is `input_dim` to `num_classes` logits."""
        projected = self.dense(self.dropout(hidden_states))
        activated = self.dropout(torch.tanh(projected))
        return self.out_proj(activated)


class BartPretrainedModel(PreTrainedModel):
    """Common base of the BART modules: binds the config class and provides weight init and checkpointing hooks."""

    config_class = BartConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _keys_to_ignore_on_load_unexpected = [r"encoder.version", r"decoder.version"]

    def _init_weights(self, module):
        """Draw `nn.Linear` / `nn.Embedding` weights from N(0, config.init_std); zero biases and the padding row."""
        init_std = self.config.init_std
        is_linear = isinstance(module, nn.Linear)
        if not is_linear and not isinstance(module, nn.Embedding):
            # Any other module type is left untouched.
            return

        module.weight.data.normal_(mean=0.0, std=init_std)
        if is_linear:
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        """Set the `gradient_checkpointing` flag on encoder/decoder modules; other modules are ignored."""
        if not isinstance(module, (BartDecoder, BartEncoder)):
            return
        module.gradient_checkpointing = value

    @property
    def dummy_inputs(self):
        """A fixed two-row batch of token ids (second row ends with the pad token) plus its attention mask."""
        pad_id = self.config.pad_token_id
        token_rows = [[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_id]]
        input_ids = torch.tensor(token_rows, device=self.device)
        return {
            "attention_mask": input_ids.ne(pad_id),
            "input_ids": input_ids,
        }


class PretrainedBartModel(BartPretrainedModel):
    """Deprecated name for `BartPretrainedModel`; defining a subclass of it emits a `FutureWarning`."""

    def __init_subclass__(cls, **kwargs):
        # The hook runs once for every class that subclasses `PretrainedBartModel`.
        warnings.warn(
            "The class `PretrainedBartModel` has been deprecated, please use `BartPretrainedModel` instead.",
            FutureWarning,
        )
        # Chain to the parent hook so cooperative `__init_subclass__` implementations and class keyword
        # arguments keep working.
        super().__init_subclass__(**kwargs)


# Class-level docstring preamble (inheritance notes and the `config` parameter). It is passed to the
# `add_start_docstrings` decorator, e.g. on `BartModel` further down in this file.
BART_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`BartConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Doctest-style usage examples (summarization and mask filling) built around `BartForConditionalGeneration`.
# NOTE(review): the place where this constant is attached to a docstring is not visible in this section of the
# file — presumably the conditional-generation model's forward docstring; confirm before editing the examples.
BART_GENERATION_EXAMPLE = r"""
    Summarization example:

    ```python
    >>> from transformers import BartTokenizer, BartForConditionalGeneration

    >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
    >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

    >>> ARTICLE_TO_SUMMARIZE = (
    ...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
    ...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
    ...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
    ... )
    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")

    >>> # Generate Summary
    >>> summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20)
    >>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions'
    ```

    Mask filling example:

    ```python
    >>> from transformers import BartTokenizer, BartForConditionalGeneration

    >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

    >>> TXT = "My friends are <mask> but they eat too many carbs."
    >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
    >>> logits = model(input_ids).logits

    >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
    >>> probs = logits[0, masked_index].softmax(dim=0)
    >>> values, predictions = probs.topk(5)

    >>> tokenizer.decode(predictions).split()
    ['not', 'good', 'healthy', 'great', 'very']
    ```
"""

# Argument documentation for the encoder-decoder `forward` signature. It is passed to the
# `add_start_docstrings_to_model_forward` decorator (e.g. on `BartModel.forward`), so every argument must start
# its own entry at the argument indentation level to be rendered as a separate item.
BART_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
            input (see `past_key_values`). This is useful if you want more control over how to convert
            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.

            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
            of `inputs_embeds`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class BartEncoder(BartPretrainedModel):
    """
    Encoder stack made of *config.encoder_layers* [`BartEncoderLayer`] modules, applied on top of scaled token
    embeddings plus learned positional embeddings.

    Args:
        config: BartConfig
        embed_tokens (nn.Embedding): token embedding to reuse; when omitted the encoder creates its own
    """

    def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        d_model = config.d_model
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
        self.embed_scale = math.sqrt(d_model) if config.scale_embedding else 1.0

        # Reuse the caller's embedding table when one is supplied; otherwise own a private one.
        self.embed_tokens = (
            embed_tokens if embed_tokens is not None else nn.Embedding(config.vocab_size, d_model, self.padding_idx)
        )
        self.embed_positions = BartLearnedPositionalEmbedding(config.max_position_embeddings, d_model)
        self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(d_model)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Run the encoder stack over a batch of token ids or pre-computed embeddings.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Vocabulary indices of the input tokens. Padding will be ignored by default should you provide it.
                Mutually exclusive with `inputs_embeds`.

                Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Padding mask with values in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Per-layer mask used to nullify selected attention heads, with values in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Embedded representation to use instead of looking `input_ids` up in the model's internal embedding
                matrix. Mutually exclusive with `input_ids`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. Defaults to
                `config.output_attentions`.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. Defaults to `config.output_hidden_states`.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. Defaults to
                `config.use_return_dict`.

        Returns:
            A [`BaseModelOutput`] when `return_dict` is true; otherwise a tuple holding, in order, the last hidden
            state, the collected hidden states and the collected attentions, with entries that are `None` left out.

        Raises:
            ValueError: if both or neither of `input_ids` / `inputs_embeds` are given, or if `head_mask` does not
                have one row per encoder layer.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Exactly one of input_ids / inputs_embeds must be supplied.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
        else:
            input_shape = inputs_embeds.size()[:-1]

        position_embeds = self.embed_positions(input_shape)
        hidden_states = self.layernorm_embedding(inputs_embeds + position_embeds)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)

        collected_states = () if output_hidden_states else None
        collected_attentions = () if output_attentions else None

        # A head mask, when given, needs exactly one row per encoder layer.
        if head_mask is not None and head_mask.size()[0] != len(self.layers):
            raise ValueError(
                f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
                f" {head_mask.size()[0]}."
            )

        for layer_idx, layer in enumerate(self.layers):
            if output_hidden_states:
                collected_states += (hidden_states,)

            # LayerDrop (see https://arxiv.org/abs/1909.11556 for description). The random draw is taken on every
            # iteration, whether or not the module is in training mode.
            drop_draw = random.uniform(0, 1)
            layer_dropped = self.training and (drop_draw < self.layerdrop)

            if layer_dropped:
                # Skipped layer: hidden states pass through unchanged and no attention is recorded.
                layer_outputs = (None, None)
            else:
                layer_head_mask = head_mask[layer_idx] if head_mask is not None else None

                if self.gradient_checkpointing and self.training:

                    def create_custom_forward(module):
                        # Bind `output_attentions` as the trailing positional argument of the layer call.
                        def custom_forward(*inputs):
                            return module(*inputs, output_attentions)

                        return custom_forward

                    layer_outputs = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(layer),
                        hidden_states,
                        attention_mask,
                        layer_head_mask,
                    )
                else:
                    layer_outputs = layer(
                        hidden_states,
                        attention_mask,
                        layer_head_mask=layer_head_mask,
                        output_attentions=output_attentions,
                    )

                hidden_states = layer_outputs[0]

            if output_attentions:
                collected_attentions += (layer_outputs[1],)

        # Also record the output of the final layer.
        if output_hidden_states:
            collected_states += (hidden_states,)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=hidden_states, hidden_states=collected_states, attentions=collected_attentions
            )
        return tuple(v for v in (hidden_states, collected_states, collected_attentions) if v is not None)


class BartDecoder(BartPretrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BartDecoderLayer`]

    Args:
        config: BartConfig
        embed_tokens (nn.Embedding): token embedding to reuse; when omitted the decoder creates its own
            `nn.Embedding(config.vocab_size, config.d_model, config.pad_token_id)`
    """

    def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)

        self.embed_positions = BartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
        )
        self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(config.d_model)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embed_tokens = value

    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
        """
        Build the self-attention mask for the decoder.

        A causal mask is created only when more than one target position is processed; a padding mask, when
        given, is expanded and summed with it. Returns `None` when neither applies.
        """
        # create causal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask = None
        if input_shape[-1] > 1:
            combined_attention_mask = _make_causal_mask(
                input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
            ).to(inputs_embeds.device)

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
            combined_attention_mask = (
                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`). Defaults to `config.use_cache`. It is forced to `False` (with a warning)
                when gradient checkpointing is active during training.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Returns:
            A [`BaseModelOutputWithPastAndCrossAttentions`] when `return_dict` is true; otherwise a tuple holding, in
            order, the last hidden state, the cache, all hidden states, all self-attentions and all cross-attentions,
            with entries that are `None` left out.

        Raises:
            ValueError: if both or neither of `input_ids` / `inputs_embeds` are given, or if `head_mask` /
                `cross_attn_head_mask` does not have one row per decoder layer.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        # past_key_values_length: dimension 2 of the first cached tensor of the first layer
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask, input_shape, inputs_embeds, past_key_values_length
        )

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])

        # embed positions
        positions = self.embed_positions(input_shape, past_key_values_length)

        hidden_states = inputs_embeds + positions
        hidden_states = self.layernorm_embedding(hidden_states)

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
        next_decoder_cache = () if use_cache else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != (len(self.layers)):
                    # Report the size of the mask being checked (`attn_mask`), not always `head_mask`, which may
                    # be None or have a different size when `cross_attn_head_mask` is the offending argument.
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.size()[0]}."
                    )

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):
                continue

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:

                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # None for past_key_value
                        return module(*inputs, output_attentions, use_cache)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer),
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    head_mask[idx] if head_mask is not None else None,
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
                    None,
                )
            else:

                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    cross_attn_layer_head_mask=(
                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                    ),
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )
            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache entry follows the two attention outputs when those are returned.
                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@add_start_docstrings(
    "The bare BART Model outputting raw hidden-states without any specific head on top.",
    BART_START_DOCSTRING,
)
class BartModel(BartPretrainedModel):
    # Encoder/decoder backbone. Both stacks are constructed with the same `nn.Embedding`
    # instance (`self.shared`), so they look tokens up in one table.

    def __init__(self, config: BartConfig):
        super().__init__(config)

        # Embedding arguments: (num_embeddings, embedding_dim, padding_idx), all read from the config.
        self.shared = nn.Embedding(config.vocab_size, config.d_model, config.pad_token_id)

        self.encoder = BartEncoder(config, self.shared)
        self.decoder = BartDecoder(config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the embedding module shared by the encoder and the decoder."""
        return self.shared

    def set_input_embeddings(self, value):
        """Install `value` as the shared embedding and point both stacks at it."""
        self.shared = value
        for stack in (self.encoder, self.decoder):
            stack.embed_tokens = self.shared

    def get_encoder(self):
        """Return the encoder stack."""
        return self.encoder

    def get_decoder(self):
        """Return the decoder stack."""
        return self.decoder

    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Seq2SeqModelOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Seq2SeqModelOutput]:
        cfg = self.config

        # Unlike other models, BART builds the decoder inputs from `input_ids` when the caller
        # supplies neither `decoder_input_ids` nor `decoder_inputs_embeds`.
        decoder_inputs_missing = decoder_input_ids is None and decoder_inputs_embeds is None
        if decoder_inputs_missing and input_ids is None:
            raise ValueError(
                "If no `decoder_input_ids` or `decoder_inputs_embeds` are "
                "passed, `input_ids` cannot be `None`. Please pass either "
                "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
            )
        if decoder_inputs_missing:
            decoder_input_ids = shift_tokens_right(input_ids, cfg.pad_token_id, cfg.decoder_start_token_id)

        # Flags left as `None` fall back to the values stored on the config.
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if use_cache is None:
            use_cache = cfg.use_cache
        if return_dict is None:
            return_dict = cfg.use_return_dict

        if encoder_outputs is None:
            # Nothing pre-computed: run the encoder now.
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            # Pre-computed outputs given as a plain sequence are wrapped in a BaseModelOutput
            # so the attribute access in the return statement below works.
            n_items = len(encoder_outputs)
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if n_items > 1 else None,
                attentions=encoder_outputs[2] if n_items > 2 else None,
            )

        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
        dec_out = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            # the encoder-side padding mask is reused for cross-attention
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if return_dict:
            return Seq2SeqModelOutput(
                last_hidden_state=dec_out.last_hidden_state,
                past_key_values=dec_out.past_key_values,
                decoder_hidden_states=dec_out.hidden_states,
                decoder_attentions=dec_out.attentions,
                cross_attentions=dec_out.cross_attentions,
                encoder_last_hidden_state=encoder_outputs.last_hidden_state,
                encoder_hidden_states=encoder_outputs.hidden_states,
                encoder_attentions=encoder_outputs.attentions,
            )

        # Tuple mode: decoder fields first, encoder fields appended.
        return dec_out + encoder_outputs


@add_start_docstrings(
    "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING
)
class BartForConditionalGeneration(BartPretrainedModel):
    # `BartModel` backbone plus a bias-free linear LM head and a separate `final_logits_bias`
    # buffer that is added to the head's output.
    base_model_prefix = "model"
    _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head.weight"]

    def __init__(self, config: BartConfig):
        super().__init__(config)
        self.model = BartModel(config)
        # Registered as a buffer (not a parameter); one bias entry per row of the shared embedding.
        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        """Return the encoder of the wrapped `BartModel`."""
        return self.model.get_encoder()

    def get_decoder(self):
        """Return the decoder of the wrapped `BartModel`."""
        return self.model.get_decoder()

    def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding:
        """Resize the token embeddings and keep `final_logits_bias` the same width.

        Args:
            new_num_tokens: Requested vocabulary size, forwarded to the parent implementation.

        Returns:
            The embedding module returned by the parent implementation.
        """
        new_embeddings = super().resize_token_embeddings(new_num_tokens)
        # Size the bias from the embedding that was actually produced rather than from the
        # request, so the two cannot disagree (and an explicit `None` request does not fail
        # on the integer comparison in `_resize_final_logits_bias`).
        self._resize_final_logits_bias(new_embeddings.weight.shape[0])
        return new_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        """Truncate or zero-pad `final_logits_bias` to `new_num_tokens` columns."""
        old_bias = self.final_logits_bias
        old_num_tokens = old_bias.shape[-1]
        if new_num_tokens <= old_num_tokens:
            new_bias = old_bias[:, :new_num_tokens]
        else:
            # Match dtype as well as device: with a half-precision buffer, default-dtype zeros
            # would change the dtype of the concatenated bias.
            extra_bias = torch.zeros(
                (1, new_num_tokens - old_num_tokens), dtype=old_bias.dtype, device=old_bias.device
            )
            new_bias = torch.cat([old_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

    def get_output_embeddings(self):
        """Return the LM head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the LM head with `new_embeddings`."""
        self.lm_head = new_embeddings

    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    @add_end_docstrings(BART_GENERATION_EXAMPLE)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Seq2SeqLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size - 1]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ...,
            config.vocab_size - 1]`.

        Returns:
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            # Computing a loss: caching is switched off (with a warning if it was requested).
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False
            # Without explicit decoder inputs, the decoder is fed the labels shifted one step right.
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # outputs[0] is the decoder's last hidden state in both tuple and dict mode.
        lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias

        masked_lm_loss = None
        if labels is not None:
            # Flatten to (tokens, vocab) vs (tokens,) for token-level cross-entropy.
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            # Tuple mode: logits replace the hidden state; the loss is prepended when present.
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs
    ):
        """Build the keyword arguments for one `forward` call during generation.

        When `past` is given, only the last column of `decoder_input_ids` is passed on.
        `input_ids` is always `None` in the returned dict; `encoder_outputs` is forwarded instead.
        """
        # cut decoder_input_ids if past is used
        if past is not None:
            decoder_input_ids = decoder_input_ids[:, -1:]

        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        """Return `labels` shifted right using the configured pad and decoder-start token ids."""
        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)

    @staticmethod
    def _reorder_cache(past, beam_idx):
        """Reorder each layer's cache along dim 0 by `beam_idx`.

        Only the first two entries of every layer tuple are re-indexed; the rest are kept as-is.
        """
        reordered_past = ()
        for layer_past in past:
            # cached cross_attention states don't have to be reordered -> they are always the same
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
            )
        return reordered_past


@add_start_docstrings(
    """
    Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    """,
    BART_START_DOCSTRING,
)
class BartForSequenceClassification(BartPretrainedModel):
    # `BartModel` backbone followed by a `BartClassificationHead`; the head is fed the decoder
    # hidden state at the last <eos> position of every sequence.

    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.model = BartModel(config)
        head = BartClassificationHead(
            config.d_model,
            config.d_model,
            config.num_labels,
            config.classifier_dropout,
        )
        self.classification_head = head
        # The head's `dense` and `out_proj` sub-modules go through the backbone's weight init.
        for layer in (head.dense, head.out_proj):
            self.model._init_weights(layer)

    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
        output_type=Seq2SeqSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
        expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
    )
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        cfg = self.config
        if return_dict is None:
            return_dict = cfg.use_return_dict
        if labels is not None:
            # a loss is being computed, so caching is disabled
            use_cache = False

        # The pooling below locates <eos> in `input_ids`, so embeddings alone cannot be used.
        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        backbone_out = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = backbone_out[0]  # last hidden state

        is_eos = input_ids.eq(cfg.eos_token_id)

        # The reshape below needs the same <eos> count in every row.
        eos_per_row = is_eos.sum(1)
        if len(torch.unique_consecutive(eos_per_row)) > 1:
            raise ValueError("All examples must have the same number of <eos> tokens.")

        # Gather the hidden states at <eos> positions, regroup them per example, keep the last one.
        batch_size, hidden_dim = hidden_states.size(0), hidden_states.size(-1)
        eos_states = hidden_states[is_eos, :].view(batch_size, -1, hidden_dim)
        sentence_representation = eos_states[:, -1, :]
        logits = self.classification_head(sentence_representation)

        loss = None
        if labels is not None:
            if cfg.problem_type is None:
                # First call with labels: infer the task and store it on the config.
                if cfg.num_labels == 1:
                    cfg.problem_type = "regression"
                elif cfg.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    cfg.problem_type = "single_label_classification"
                else:
                    cfg.problem_type = "multi_label_classification"

            task = cfg.problem_type
            if task == "regression":
                criterion = MSELoss()
                if cfg.num_labels == 1:
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif task == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, cfg.num_labels), labels.view(-1))
            elif task == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if return_dict:
            return Seq2SeqSequenceClassifierOutput(
                loss=loss,
                logits=logits,
                past_key_values=backbone_out.past_key_values,
                decoder_hidden_states=backbone_out.decoder_hidden_states,
                decoder_attentions=backbone_out.decoder_attentions,
                cross_attentions=backbone_out.cross_attentions,
                encoder_last_hidden_state=backbone_out.encoder_last_hidden_state,
                encoder_hidden_states=backbone_out.encoder_hidden_states,
                encoder_attentions=backbone_out.encoder_attentions,
            )

        # Tuple mode: logits replace the hidden state; the loss is prepended when present.
        output = (logits,) + backbone_out[1:]
        if loss is not None:
            return (loss,) + output
        return output


@add_start_docstrings(
    """
    BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    BART_START_DOCSTRING,
)
class BartForQuestionAnswering(BartPretrainedModel):
    # `BartModel` backbone plus one linear layer with two outputs per token: start and end logits.

    def __init__(self, config):
        super().__init__(config)

        # The head always has two outputs, so the label count on the config is overwritten.
        config.num_labels = 2
        self.num_labels = config.num_labels

        self.model = BartModel(config)
        qa_head = nn.Linear(config.hidden_size, config.num_labels)
        self.qa_outputs = qa_head

        self.model._init_weights(qa_head)

    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_QA,
        output_type=Seq2SeqQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_loss=_QA_EXPECTED_LOSS,
        expected_output=_QA_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        input_ids: torch.Tensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Seq2SeqQuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # A loss is only computed when both span ends are labelled; caching is disabled then.
        has_span_labels = start_positions is not None and end_positions is not None
        if has_span_labels:
            use_cache = False

        backbone_out = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Project every position to two scores, then separate them along the last axis.
        span_logits = self.qa_outputs(backbone_out[0])
        start_logits, end_logits = span_logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if has_span_labels:
            # If we are on multi-GPU, split add a dimension
            if start_positions.dim() > 1:
                start_positions = start_positions.squeeze(-1)
            if end_positions.dim() > 1:
                end_positions = end_positions.squeeze(-1)

            # Out-of-range positions are clamped onto `ignored_index` (one past the last valid
            # position), which the loss is then told to skip.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            criterion = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = criterion(start_logits, start_positions)
            end_loss = criterion(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if return_dict:
            return Seq2SeqQuestionAnsweringModelOutput(
                loss=total_loss,
                start_logits=start_logits,
                end_logits=end_logits,
                past_key_values=backbone_out.past_key_values,
                decoder_hidden_states=backbone_out.decoder_hidden_states,
                decoder_attentions=backbone_out.decoder_attentions,
                cross_attentions=backbone_out.cross_attentions,
                encoder_last_hidden_state=backbone_out.encoder_last_hidden_state,
                encoder_hidden_states=backbone_out.encoder_hidden_states,
                encoder_attentions=backbone_out.encoder_attentions,
            )

        # Tuple mode: both logit tensors replace the hidden state; the loss is prepended when present.
        output = (start_logits, end_logits) + backbone_out[1:]
        if total_loss is not None:
            return (total_loss,) + output
        return output


class BartDecoderWrapper(BartPretrainedModel):
    """
    Thin pretrained-model shell around a single `BartDecoder`, stored as `self.decoder`. It exists so that
    pretrained checkpoints load correctly when the causal language model is used in combination with the
    [`EncoderDecoderModel`] framework.
    """

    def __init__(self, config):
        super().__init__(config)
        decoder_stack = BartDecoder(config)
        self.decoder = decoder_stack

    def forward(self, *args, **kwargs):
        """Pass every argument straight through to the wrapped decoder and return its result."""
        decoder_result = self.decoder(*args, **kwargs)
        return decoder_result


class BartForCausalLM(BartPretrainedModel):
    """
    BART decoder with a bias-free linear language-modeling head, used without the BART encoder.

    The constructor works on a deep copy of the given config with `is_decoder=True` and
    `is_encoder_decoder=False`, so the caller's config object is not modified.
    """

    def __init__(self, config):
        # Copy first: the two flags below must not leak back into the caller's config.
        config = copy.deepcopy(config)
        config.is_decoder = True
        config.is_encoder_decoder = False
        super().__init__(config)
        self.model = BartDecoderWrapper(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the wrapped decoder's token embedding module."""
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the wrapped decoder's token embedding module with `value`."""
        self.model.decoder.embed_tokens = value

    def get_output_embeddings(self):
        """Return the LM head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the LM head with `new_embeddings`."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the decoder held by the wrapper with `decoder`."""
        self.model.decoder = decoder

    def get_decoder(self):
        """Return the decoder held by the wrapper."""
        return self.model.decoder

    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                if the model is configured as a decoder.
            encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
                in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
                tensors are only required when the model is used as a decoder in a Sequence to Sequence model.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size - 1]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
                ignored (masked), the loss is only computed for the tokens with labels in `[0, ...,
                config.vocab_size - 1]`.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Returns:

        Example:

        ```python
        >>> from transformers import BartTokenizer, BartForCausalLM

        >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
        >>> model = BartForCausalLM.from_pretrained("facebook/bart-base", add_cross_attention=False)
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
        >>> list(logits.shape) == expected_shape
        True
        ```"""

        # Flags left as `None` fall back to the config. `use_cache` is not resolved here; it is
        # passed to the decoder unchanged.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # The decoder is called directly rather than through the wrapper's forward.
        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # outputs[0] is the decoder's last hidden state in both tuple and dict mode.
        logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            # Flatten to (tokens, vocab) vs (tokens,). Labels are used as given: this method does
            # not shift them relative to the logits.
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            # Tuple mode: logits replace the hidden state; the loss is prepended when present.
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, use_cache=None, **kwargs):
        """Build the keyword arguments for one `forward` call during generation.

        A missing attention mask is replaced by an all-ones mask shaped like `input_ids`; when `past`
        is truthy only the last column of `input_ids` is passed on. Extra `kwargs` are ignored.
        """
        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_ids.shape)

        # With a cache available, earlier positions are already covered by `past`.
        if past:
            input_ids = input_ids[:, -1:]
        # On the first step `past` is empty, so the full `input_ids` is returned.
        return {
            "input_ids": input_ids,  # the full sequence, or only its last token when `past` is set
            "attention_mask": attention_mask,
            "past_key_values": past,
            "use_cache": use_cache,
        }

    @staticmethod
    def _reorder_cache(past, beam_idx):
        """Reorder every cached tensor of every layer along dim 0 by `beam_idx`."""
        reordered_past = ()
        for layer_past in past:
            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
        return reordered_past

================================================
FILE: hanlp/components/amr/amrbart/model_interface/tokenization_bart.py
================================================
# coding:utf-8
# this is a simplified version of "https://github.com/SapienzaNLP/spring/blob/main/spring_amr/tokenization_bart.py"
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import penman
import regex as re
from transformers import BartTokenizer

from hanlp.components.amr.amrbart.common import postprocessing
from hanlp.components.amr.amrbart.common.constant import raw_special_tokens, recategorizations
from hanlp.components.amr.amrbart.common.penman_interface import encode


class AMRBartTokenizer(BartTokenizer):
    INIT = 'Ġ'

    def __init__(
            self,
            vocab_file,
            merges_file,
            errors="replace",
            bos_token="<s>",
            eos_token="</s>",
            sep_token="</s>",
            cls_token="<s>",
            unk_token="<unk>",
            pad_token="<pad>",
            mask_token="<mask>",
            add_prefix_space=False,
            **kwargs
    ):
        """Initialise the parent tokenizer and the AMR-specific state.

        All arguments are forwarded positionally (plus ``**kwargs``) to the
        parent initializer. Afterwards the instance gets:

        - ``remove_pars``: flag read by ``_fix_and_make_graph`` (``False``).
        - ``patterns``: compiled pre-tokenization pattern used by ``_tok_bpe``.
        - ``recategorizations``: set built from the ``recategorizations`` constant.
        - ``modified``: number of added AMR tokens, ``0`` until
          ``init_amr_vocabulary`` runs.
        """
        super().__init__(
            vocab_file, merges_file, errors, bos_token, eos_token, sep_token, cls_token,
            unk_token, pad_token, mask_token, add_prefix_space, **kwargs
        )
        self.remove_pars = False
        self.patterns = re.compile(r""" ?<[a-z]+:?\d*>| ?:[^\s]+|'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
        self.recategorizations = set(recategorizations)
        self.modified = 0

    @classmethod
    def from_pretrained(cls, pretrained_model_path, *args, **kwargs):
        """Load a tokenizer through the parent loader and add the AMR vocabulary.

        Args:
            pretrained_model_path: Forwarded to the parent ``from_pretrained``.
            *args, **kwargs: Forwarded unchanged to the parent loader.

        Returns:
            The loaded tokenizer after ``init_amr_vocabulary()`` has been called on it.
        """
        tokenizer = super().from_pretrained(pretrained_model_path, *args, **kwargs)
        tokenizer.init_amr_vocabulary()
        return tokenizer

    def init_amr_vocabulary(self):
        """Append the AMR special tokens to the vocabulary and rebuild both id maps.

        Tokens from ``raw_special_tokens`` that are missing from ``self.encoder``
        receive ids starting at the previous vocabulary size. The encoder is then
        renumbered densely in id order and ``self.decoder`` is rebuilt as its
        inverse. Also records ``old_enc_size``, ``modified`` and the
        ``<AMR>`` / ``</AMR>`` token strings and ids.
        """
        base_size = len(self.encoder)
        self.old_enc_size = base_size
        new_tokens = [tok for tok in raw_special_tokens if tok not in self.encoder]
        self.encoder.update((tok, idx) for idx, tok in enumerate(new_tokens, start=base_size))

        # Renumber in current id order so that ids are contiguous from zero.
        ordered = sorted(self.encoder.items(), key=lambda item: item[1])
        self.encoder = {tok: rank for rank, (tok, _) in enumerate(ordered)}
        # The encoder was just built in id order, so plain iteration is already sorted.
        self.decoder = {idx: tok for tok, idx in self.encoder.items()}
        self.modified = len(new_tokens)

        self.amr_bos_token = "<AMR>"
        self.amr_eos_token = "</AMR>"
        self.amr_bos_token_id = self.encoder[self.amr_bos_token]
        self.amr_eos_token_id = self.encoder[self.amr_eos_token]

    def _tokenize(self, text):
        """ Tokenize a string. Modified in order to handle sentences with recategorization pointers"""
        bpe_tokens = []
        for tok_span in text.lstrip().split(' '):
            tok_span = tok_span.strip()
            recats = tok_span.rsplit('_', 1)
            if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder:
                bpe_tokens.extend([self.INIT + recats[0], '_' + recats[1]])
            else:
                for token in re.findall(self.pat, ' ' + tok_span):
                    token = "".join(
                        self.byte_encoder[b] for b in token.encode("utf-8")
                    )   # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
                    bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))

        return bpe_tokens

    def _tok_bpe(self, token):
        tokk = []
        tok = token.strip()
        recats = tok.rsplit('_', 1)
        if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder:
            tokk.extend([self.INIT + recats[0], '_' + recats[1]])
        else:
            for tok in self.patterns.findall(' ' + token):
                tok = "".join(
                    self.byte_encoder[b] for b in tok.encode("utf-8"))
                toks = self.bpe(tok).split(' ')
                tokk.extend(toks)
        return tokk

    def tokenize_amr(self, amr_tokens):
        """Convert linearized AMR tokens into vocabulary ids.

        Each token is mapped to one or more BPE tokens:

        - A double-quoted literal is unquoted, its underscores become spaces,
          and its BPE pieces are wrapped in ``<lit>`` / ``</lit>`` markers.
        - A token that exists in the vocabulary (with the ``INIT`` prefix) is
          used as a single token.
        - A frame (matches ``.+-\\d\\d``) is split into the BPE pieces of
          everything but the last three characters, plus those three characters.
        - An inverse relation (``:...-of``) is split into the relation and ``-of``.
        - Any other relation (``:...``) is split into ``:`` and the BPE of the rest.
        - Everything else goes through ``_tok_bpe``.

        Args:
            amr_tokens: Iterable of AMR token strings.

        Returns:
            A flat list of token ids; BPE tokens missing from the vocabulary map
            to ``self.unk_token_id``.

        Raises:
            ValueError: If a ``<...>`` token is not in the vocabulary and is
                neither a frame nor a relation. This branch used to print and
                call ``exit()``, which terminated the host process.
        """
        bpe_tokens = []
        for tokk in amr_tokens:
            is_in_enc = self.INIT + tokk in self.encoder
            is_rel = tokk.startswith(':') and len(tokk) > 1
            is_spc = tokk.startswith('<') and tokk.endswith('>')
            is_of = tokk.startswith(':') and tokk.endswith('-of')
            is_frame = re.match(r'.+-\d\d', tokk) is not None

            if tokk.startswith('"') and tokk.endswith('"'):
                # Literals such as "The_United_Kingdom_of_xxx".
                literal = tokk[1:-1].replace('_', ' ')
                bpe_toks = [self.INIT + "<lit>"]
                bpe_toks += self._tok_bpe(literal)
                bpe_toks.append(self.INIT + "</lit>")
            elif is_in_enc:
                bpe_toks = [self.INIT + tokk]
            elif is_frame:
                # Keep the trailing "-NN" sense suffix as its own token.
                bpe_toks = self._tok_bpe(tokk[:-3]) + [tokk[-3:]]
            elif is_of:
                rel = tokk[:-3]
                if self.INIT + rel in self.encoder:
                    bpe_toks = [self.INIT + rel, '-of']
                else:
                    bpe_toks = [self.INIT + ':'] + self._tok_bpe(rel[1:]) + ['-of']
            elif is_rel:
                bpe_toks = [self.INIT + ':'] + self._tok_bpe(tokk[1:])
            elif is_spc:
                # Unknown special token: fail loudly without killing the interpreter.
                raise ValueError(
                    f"Cannot tokenize AMR token {tokk!r}: special token not in vocabulary "
                    f"(is_rel:{is_rel}, is_spc:{is_spc}, is_frame:{is_frame}, is_of:{is_of})"
                )
            else:
                bpe_toks = self._tok_bpe(tokk)

            bpe_tokens.extend(bpe_toks)
        return [self.encoder.get(b, self.unk_token_id) for b in bpe_tokens]

    def decode_amr(self, tokens, restore_name_ops=None):
        """Decode generated tokens into an AMR graph on a best-effort basis.

        Args:
            tokens: Passed to ``postprocessing.decode_into_node_and_backreferences``
                together with this tokenizer.
            restore_name_ops: Accepted but not used by this method.

        Returns:
            A tuple ``(graph, status, (nodes, backreferences))``. If decoding the
            tokens or building the graph fails, the result is the backoff graph
            and status with ``(None, None)``. If reconnecting the graph fails,
            the result is the backoff graph and status with the decoded
            ``(nodes, backreferences)``.
        """
        # Stage 1: tokens -> nodes -> repaired graph. Any failure gives the bare backoff.
        try:
            nodes, backreferences = postprocessing.decode_into_node_and_backreferences(tokens, self)
            graph = self._fix_and_make_graph(nodes)
        except Exception:
            return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)

        # Stage 2: make sure the graph is connected; on failure keep the decoded nodes.
        try:
            graph, status = postprocessing.connect_graph_if_not_connected(graph)
        except Exception:
            return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (nodes, backreferences)
        return graph, status, (nodes, backreferences)

    def _fix_and_make_graph(self, nodes):
        """Repair a decoded node sequence and parse it into a penman graph.

        The sequence goes through a series of clean-up passes (special-token
        removal, pointer rewriting, relation merging, variable de-duplication,
        parenthesis balancing and a small grammar filter), is joined into a
        PENMAN string, decoded, patched for missing targets, re-encoded, fixed
        with a few regular expressions and decoded again.

        Args:
            nodes: Sequence of decoded nodes; string and non-string items are
                both handled (non-strings are stringified in the grammar pass).

        Returns:
            The graph returned by the final ``penman.decode`` call.

        Note:
            Several steps index into ``nodes`` without bounds checks, so malformed
            input raises (e.g. ``IndexError``); ``decode_amr`` in this class
            catches those exceptions and backs off.
        """

        # Pass 1: drop special tokens of the form <...>, but keep <pointer:N> tokens.
        nodes_ = []
        for n in nodes:
            if isinstance(n, str):
                if n.startswith('<') and n.endswith('>') and (not n.startswith('<pointer:')):
                    pass
                else:
                    nodes_.append(n)
            else:
                nodes_.append(n)
        nodes = nodes_

        if True:
            # Pass 2: split a pointer fused with trailing text ("<pointer:3>foo")
            # into the pointer token and the remaining text.
            i = 0
            nodes_ = []
            while i < len(nodes):
                nxt = nodes[i]
                pst = None
                if isinstance(nxt, str) and nxt.startswith('<pointer:'):
                    e = nxt.find('>')
                    if e != len(nxt) -1:
                        pst = nxt[e+1:]
                        nxt = nxt[:e+1]
                    nodes_.append(nxt)
                    if pst is not None:
                        nodes_.append(pst)
                else:
                    nodes_.append(nxt)
                i += 1
            nodes = nodes_

            # Pass 3: turn pointers into variables "zN". The first node is copied
            # verbatim (the loop starts at index 1).
            i = 1
            nodes_ = [nodes[0]]
            while i < len(nodes):
                nxt = nodes[i]
                if isinstance(nxt, str) and nxt.startswith('<pointer:'):
                    nxt = 'z' + nxt[9:-1]
                    # NOTE(review): raises IndexError when a pointer is the last node.
                    fol = nodes[i+1]
                    # is not expansion
                    # (the pointer is followed by a relation or ')', so it is a
                    # re-entrant reference and only the variable is emitted)
                    if isinstance(fol, str) and (fol.startswith(':') or (fol == ')')):
                        nodes_.append(nxt)
                    else:
                        # The pointer introduces a node: emit "( zN /", adding the
                        # opening parenthesis unless one was just emitted
                        # (always adding it when remove_pars is set).
                        if self.remove_pars:
                            nodes_.append('(')
                        else:
                            if nodes_[-1] != '(':
                                nodes_.append('(')
                                #pass
                        nodes_.append(nxt)
                        nodes_.append('/')
                else:
                    nodes_.append(nxt)
                i += 1
            nodes = nodes_

        # Pass 4: glue a lone ':' to the token that follows it (": ARG0" -> ":ARG0").
        # `last` records whether the final node still has to be appended after
        # the loop (it is False when the final pair was merged).
        # NOTE(review): `last` is only assigned inside the loop, so a sequence
        # with fewer than two nodes raises UnboundLocalError at `if last`.
        i = 0
        nodes_ = []
        while i < (len(nodes) - 1):
            if nodes[i] == ':':
                nodes_.append(nodes[i] + nodes[i+1])
                i += 2
                last = False
            else:
                nodes_.append(nodes[i])
                i += 1
                last = True
        if last:
            nodes_.append(nodes[-1])
        nodes = nodes_

        # Pass 5: when a '/' arrives and the output already has '/' two slots
        # back ("/ a / b"), skip the second '/' and the node after it.
        i = 0
        nodes_ = []
        while i < (len(nodes)):
            if i < 2:
                nodes_.append(nodes[i])
                i += 1
            elif nodes_[-2] == '/' and nodes[i] == '/':
                i += 2
            else:
                nodes_.append(nodes[i])
                i += 1
        nodes = nodes_

        # Pass 6: make variable declarations unique. A variable declared a second
        # time (seen again right before '/') is renamed to z1000, z1001, ... and
        # later non-declaring uses of the old name follow the latest renaming.
        i = 0
        newvars = 0
        variables = set()
        remap = {}
        nodes_ = []
        while i < (len(nodes)):

            next = nodes[i]

            if next == '/':
                last = nodes_[-1]
                if last in variables:
                    last_remap = f"z{newvars+1000}"
                    newvars += 1
                    nodes_[-1] = last_remap
                    remap[last] = last_remap
                variables.add(last)
                nodes_.append(next)

            elif self._classify(next) == 'VAR' and next in remap and (i < len(nodes) - 1) and nodes[i+1] != '/':
                next = remap[next]
                nodes_.append(next)

            else:
                nodes_.append(next)

            i += 1

        # Pass 7: balance parentheses. Ensure a leading '(', stop copying as soon
        # as the open/close counts match, then append any missing ')'.
        nodes = nodes_
        pieces_ = []
        open_cnt = 0
        closed_cnt = 0
        if nodes[0] != '(':
            pieces_.append('(')
            open_cnt += 1
        for p in nodes:
            if p == '(':
                open_cnt += 1
            elif p == ')':
                closed_cnt += 1
            pieces_.append(p)
            if open_cnt == closed_cnt:
                break
        nodes = pieces_ + [')'] * (open_cnt - closed_cnt)

        # Pass 8: grammar filter. Each piece is classified with `_classify` and
        # kept only if it may follow the class of the previously kept piece;
        # pieces that do not fit are silently dropped.
        pieces = []
        for piece in nodes:
            if not pieces:
                # The first node is always replaced by an opening parenthesis.
                pieces.append('(')
            else:
                piece = str(piece)
                # Normalise quoting: strip every inner '"' and wrap the piece once.
                # NOTE(review): the two startswith checks are identical; the second
                # was presumably meant to be endswith — confirm before changing.
                if piece.startswith('"') or piece.startswith('"') or '"' in piece.strip('"'):
                    piece = '"' + piece.replace('"', '') + '"'

                prev = self._classify(pieces[-1])
                next = self._classify(piece)

                if next == 'CONST':
                    # Constants containing any of these characters are quoted.
                    quote = False
                    for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\', '_', '='):
                        if char in piece:
                            quote = True
                            break
                    if quote:
                        piece = '"' + piece.strip('"') + '"'

                if  prev == '(':
                    if next in ('VAR', 'I'):
                        pieces.append(piece)
                elif prev == ')':
                    if next in (')', 'EDGE', 'MODE'):
                        pieces.append(piece)
                elif prev == 'VAR':
                    if next in ('/', 'EDGE', 'MODE', ')'):
                        pieces.append(piece)
                elif prev == '/':
                    if next in ('INST', 'I'):
                        pieces.append(piece)
                elif prev == 'INST':
                    if next in (')', 'EDGE', 'MODE'):
                        pieces.append(piece)
                elif prev == 'I':
                    if next in ('/', ')', 'EDGE', 'MODE'):
                        pieces.append(piece)
                elif prev == 'EDGE':
                    if next in ('(', 'VAR', 'CONST', 'I'):
                        pieces.append(piece)
                    # An edge without a target is overwritten by what follows.
                    elif next == ')':
                        pieces[-1] = piece
                    elif next in ('EDGE', 'MODE'):
                        pieces[-1] = piece
                elif prev == 'MODE':
                    if next == 'INST':
                        pieces.append(piece)
                elif prev == 'CONST':
                    if next in (')', 'EDGE', 'MODE'):
                        pieces.append(piece)

        # Pass 9: balance parentheses again after the filter (same logic as pass 7).
        pieces_ = []
        open_cnt = 0
        closed_cnt = 0
        if pieces[0] != '(':
            pieces_.append('(')
            open_cnt += 1
        for p in pieces:
            if p == '(':
                open_cnt += 1
            elif p == ')':
                closed_cnt += 1
            pieces_.append(p)
            if open_cnt == closed_cnt:
                break
        pieces = pieces_ + [')'] * (open_cnt - closed_cnt)

        # Join into a single-line PENMAN string with normalised whitespace.
        linearized = re.sub(r'\s+', ' ', ' '.join(pieces)).strip()

        # The string literal below is disabled legacy code kept as a no-op
        # expression statement; it is never executed.
        """
        line = linearized
        # make sure parentheses match
        # copied from https://github.com/RikVN/AMR/blob/master/restoreAMR/restore_amr.py
        open_count = 0
        close_count = 0
        for i, c in enumerate(line):
            if c == '(':
                open_count += 1
            elif c == ')':
                close_count += 1
            if open_count == close_count and open_count > 0:
                line = line[:i].strip()
                break
        old_line = line
        while True:
            open_count = len(re.findall(r'\(', line))
            close_count = len(re.findall(r'\)', line))
            if open_count > close_count:
                line += ')' * (open_count - close_count)
            elif close_count > open_count:
                for i in range(close_count - open_count):
                    line = line.rstrip(')')
                    line = line.rstrip(' ')
            if old_line == line:
                break
            old_line = line
        """

        # First parse, then patch triples with missing parts:
        #   - triples without a source are dropped;
        #   - an instance without a concept becomes an instance of 'thing';
        #   - any other missing target gets a fresh variable z2000, z2001, ...
        #     which is itself declared as an instance of 'thing'.
        graph = penman.decode(linearized + ' ')
        triples = []
        newvars = 2000
        for triple in graph.triples:
            x, rel, y = triple
            if x is None:
                pass
            elif rel == ':instance' and y is None:
                triples.append(penman.Triple(x, rel, 'thing'))
            elif y is None:
                var = f'z{newvars}'
                newvars += 1
                triples.append(penman.Triple(x, rel, var))
                triples.append(penman.Triple(var, ':instance', 'thing'))
            else:
                triples.append(triple)
        graph = penman.Graph(triples)
        linearized = encode(graph)

        def fix_text(linearized=linearized):
            # Text-level repairs applied to the re-encoded string.
            n = 0
            def _repl1(match):
                # "(" + letter + rest  ->  "(" + letter + (3000 + n) + " / " + letter + rest
                nonlocal n
                out = match.group(1) + match.group(2) + str(3000 + n) + ' / ' + match.group(2) + match.group(3)
                n += 1
                return out
            # Matches "(" followed by a letter and text without '/', ':' or ')'
            # up to the next ':' or ')'.
            linearized = re.sub(r'(\(\s?)([a-z])([^\/:\)]+[:\)])', _repl1, linearized,
                                flags=re.IGNORECASE | re.MULTILINE)

            def _repl2(match):
                # Keep the first "/ concept" part and drop the repeated ones.
                return match.group(1)
            linearized = re.sub(r'(\(\s*[a-z][\d+]\s*\/\s*[^\s\)\(:\/]+\s*)((?:/\s*[^\s\)\(:\/]+\s*)+)', _repl2,
                                linearized,
                                flags=re.IGNORECASE | re.MULTILINE)

            # adds a ':' to args w/o it
            linearized = re.sub(r'([^:])(ARG)', r'\1 :\2', linearized)

            # removes edges with no node
            # linearized = re.sub(r':[^\s\)\(:\/]+?\s*\)', ')', linearized, flags=re.MULTILINE)

            return linearized

        # Final parse of the repaired text; a decode error propagates to the caller.
        linearized = fix_text(linearized)
        g = penman.decode(linearized)
        return g

    def _classify(self, node):
        if not isinstance(node, str):
            return "CONST"
        elif node == 'i':
            return "I"
        elif re.match(r'^[a-z]\d*$', node) is not None:
            return "VAR"
        elif node[0].isdigit():
            return "CONST"
        elif node.startswith('"') and node.endswith('"'):
            return "CONST"
        elif node in ('+', '-'):
            return "CONST"
        elif node == ':mode':
            return 'MODE'
        elif node.startswith(':'):
            return "EDGE"
        elif node in ['/', '(', ')']:
            return node
        elif node[0].isalpha():
            for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\'):
                if char in node:
                    return "CONST"
            return "INST"
        else:
            return 'CONST'

================================================
FILE: hanlp/components/amr/amrbart/preprocess/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-03 20:33


================================================
FILE: hanlp/components/amr/amrbart/preprocess/amr_io.py
================================================
# coding:utf-8
# the code is migrated from https://github.com/SapienzaNLP/spring 
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import glob
from pathlib import Path
from typing import List, Union, Iterable
from hanlp.components.amr.amrbart.preprocess.penman_interface import load as pm_load


def read_raw_amr_data(
        paths: List[Union[str, Path]], use_recategorization=False, dereify=True, remove_wiki=False,
):
    """Load AMR graphs from one or more files or glob patterns.

    Args:
        paths: A list of paths/glob patterns, or a single ``str``/``Path``.
        use_recategorization: Use the graph recategorization trick: the original
            sentence is kept under ``snt_orig`` and ``snt`` is rebuilt from the
            ``tokens`` metadata without the ``-L...-`` / ``-R...-`` bracket tokens.
        dereify: Forwarded to the penman loader (dereify edges in g that have
            reifications in model).
        remove_wiki: Forwarded to the penman loader (remove wiki links).

    Returns:
        The list of loaded graphs.

    Raises:
        AssertionError: If ``paths`` is empty or no graph could be loaded.
    """
    assert paths, 'No paths given'
    # ``str`` is itself iterable, so the generic Iterable check alone would treat
    # a single string path as a sequence of one-character glob patterns.
    if isinstance(paths, (str, Path)) or not isinstance(paths, Iterable):
        paths = [paths]

    graphs = []
    for pattern in paths:
        for path in glob.glob(str(pattern)):
            graphs.extend(pm_load(Path(path), dereify=dereify, remove_wiki=remove_wiki))

    assert graphs, 'No graphs loaded'

    if use_recategorization:
        for g in graphs:
            metadata = g.metadata
            metadata["snt_orig"] = metadata["snt"]
            # NOTE(security): eval() executes arbitrary expressions found in the
            # AMR file metadata; only use this on trusted data.
            tokens = eval(metadata["tokens"])
            metadata["snt"] = " ".join(
                t for t in tokens
                if not (t.startswith(("-L", "-R")) and t.endswith("-"))
            )

    return graphs


================================================
FILE: hanlp/components/amr/amrbart/preprocess/penman_interface.py
================================================
# coding:utf-8
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from penman import load as load_, Graph, Triple
from penman import loads as loads_
from penman import encode as encode_
from penman.model import Model
from penman.models.noop import NoOpModel
from penman.models import amr

# Penman model instances shared by the helpers in this module.
op_model = Model()  # returned by _get_model() when `dereify` is truthy
noop_model = NoOpModel()  # returned by _get_model() when `dereify` is falsy but not None
amr_model = amr.model  # penman's AMR model, used by encode()
DEFAULT = op_model  # returned by _get_model() when `dereify` is None


def _get_model(dereify):
    """Pick the penman model matching the ``dereify`` setting.

    Returns ``DEFAULT`` for ``None``, ``op_model`` for a truthy value and
    ``noop_model`` for any other falsy value.
    """
    if dereify is None:
        return DEFAULT
    return op_model if dereify else noop_model


def _remove_wiki(graph):
    """Return a new graph whose ``:wiki`` targets are all replaced by ``"+"``.

    The new graph is built from the (possibly rewritten) triples only and is
    given the metadata object of the input graph.
    """
    kept_metadata = graph.metadata
    rewritten = []
    for triple in graph.triples:
        source, role, _ = triple
        rewritten.append(Triple(source, role, "+") if role == ":wiki" else triple)
    result = Graph(rewritten)
    result.metadata = kept_metadata
    return result


def load(source, dereify=None, remove_wiki=False):
    """Load graphs from ``source`` with penman.

    Args:
        source: Forwarded to ``penman.load``.
        dereify: Selects the penman model through ``_get_model``.
        remove_wiki: When true, every loaded graph is replaced in place by
            the result of ``_remove_wiki``.

    Returns:
        The list returned by ``penman.load``.
    """
    graphs = load_(source=source, model=_get_model(dereify))
    if remove_wiki:
        for position, graph in enumerate(graphs):
            graphs[position] = _remove_wiki(graph)
    return graphs


def loads(string, dereify=None, remove_wiki=False):
    """Parse graphs from ``string`` with penman.

    Args:
        string: Forwarded to ``penman.loads``.
        dereify: Selects the penman model through ``_get_model``.
        remove_wiki: When true, every parsed graph is replaced in place by
            the result of ``_remove_wiki``.

    Returns:
        The list returned by ``penman.loads``.
    """
    graphs = loads_(string=string, model=_get_model(dereify))
    if remove_wiki:
        for position, graph in enumerate(graphs):
            graphs[position] = _remove_wiki(graph)
    return graphs


def encode(g, top=None, indent=-1, compact=False):
    """Serialize graph ``g`` with ``penman.encode``, always using ``amr_model``.

    ``top``, ``indent`` and ``compact`` are forwarded unchanged.
    """
    return encode_(g=g, top=top, indent=indent, compact=compact, model=amr_model)


================================================
FILE: hanlp/components/amr/amrbart/preprocess/read_and_process.py
================================================
# coding:utf-8
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import re
import copy
import json
import yaml
import penman
from tqdm import tqdm
from pathlib import Path
from hanlp.components.amr.amrbart.preprocess.amr_io import read_raw_amr_data


def _tokenize_encoded_graph(encoded):
    linearized = re.sub(r"(\".+?\")", r" \1 ", encoded)
    pieces = []
    for piece in linearized.split():
        if piece.startswith('"') and piece.endswith('"'):
            pieces.append(piece)
        else:
            piece = piece.replace("(", " ( ")
            piece = piece.replace(")", " ) ")
            piece = piece.replace(":", " :")
            piece = piece.replace("/", " / ")
            piece = piece.strip()
            pieces.append(piece)
    linearized = re.sub(r"\s+", " ", " ".join(pieces)).strip()
    return linearized.split(" ")


def dfs_linearize(graph, remove_pars=False, use_pointer_tokens=True):
    """Linearize a graph into tokens, optionally replacing variables by pointers.

    The graph is deep-copied, its metadata cleared, encoded with penman and
    tokenized. With ``use_pointer_tokens``, every token that precedes a ``/``
    is assigned ``<pointer:N>``; a declaration ``( var /`` collapses to
    ``( <pointer:N>`` and a variable that follows a ``:relation`` token is
    replaced by its pointer.

    Args:
        graph: The penman graph to linearize (left unmodified).
        remove_pars: Drop every ``(`` token; only honoured when
            ``use_pointer_tokens`` is true.
        use_pointer_tokens: Enable the pointer rewriting described above.

    Returns:
        The list of linearized tokens.
    """
    stripped = copy.deepcopy(graph)
    stripped.metadata = {}
    tokens = _tokenize_encoded_graph(penman.encode(stripped))
    if not use_pointer_tokens:
        return tokens

    # A token is a variable exactly when it is directly followed by '/'.
    pointers = {}
    for previous, current in zip(tokens, tokens[1:]):
        if current == "/":
            pointers[previous] = f"<pointer:{len(pointers)}>"

    rewritten = [tokens[0]]
    pos = 1
    while pos < len(tokens):
        token = tokens[pos]
        emitted = rewritten[-1]
        if token in pointers:
            if emitted == "(" and tokens[pos + 1] == "/":
                # Declaration: emit the pointer and swallow the '/' that follows.
                token = pointers[token]
                pos += 1
            elif emitted.startswith(":"):
                # Re-entrant reference right after a relation.
                token = pointers[token]
        rewritten.append(token)
        pos += 1

    if remove_pars:
        rewritten = [token for token in rewritten if token != "("]
    return rewritten


def main():
    """Command-line entry point: linearize an AMR file into three output files.

    Reads the YAML config given by ``--config`` (keys ``use_recategorization``,
    ``remove_wiki`` and ``dereify``), loads the graphs from ``--input_file``,
    linearizes each one with the default ``dfs_linearize`` settings and writes
    ``<output_prefix>.amr`` (linearized graphs), ``<output_prefix>.txt``
    (sentences) and ``<output_prefix>.jsonl`` (one JSON object per pair).
    Unrecognised command-line arguments are ignored.
    """
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

    parser = ArgumentParser(description="AMR processing script", formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('--config', type=Path, default='default.yaml', help='Use the following config for hparams.')
    parser.add_argument('--input_file', type=str, help='The input AMR file.')
    parser.add_argument('--output_prefix', type=str, help='The output_prefix.')
    args, _unparsed = parser.parse_known_args()

    with args.config.open() as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    graphs = read_raw_amr_data(
        [args.input_file],
        use_recategorization=config["use_recategorization"],
        remove_wiki=config["remove_wiki"],
        dereify=config["dereify"],
    )

    sentences, line_amr = [], []
    for graph in tqdm(graphs):
        tokens = dfs_linearize(graph)
        sentences.append(graph.metadata["snt"])
        line_amr.append(" ".join(tokens))

    print(f"all {len(line_amr)} AMRs processed")

    def _write_lines(suffix, lines):
        # One entry per line, always terminated by a final newline.
        with open(args.output_prefix + suffix, "w", encoding="utf-8") as fout:
            fout.write("\n".join(lines) + "\n")

    _write_lines(".amr", line_amr)
    _write_lines(".txt", sentences)
    _write_lines(".jsonl", [json.dumps({"sent": sent, "amr": lamr}) for lamr, sent in zip(line_amr, sentences)])


# Run the preprocessing CLI only when this file is executed as a script.
if __name__ == '__main__':
    main()


================================================
FILE: hanlp/components/amr/seq2seq/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-04-27 19:24


================================================
FILE: hanlp/components/amr/seq2seq/dataset/IO.py
================================================
import glob
from typing import List, Union, Iterable
from pathlib import Path
from .penman import pm_load as pm_load


def read_raw_amr_data(
        paths: List[Union[str, Path]],
        use_recategorization=False,
        dereify=True,
        remove_wiki=False,
):
    """Load AMR graphs from one or more files or glob patterns.

    Args:
        paths: A list of paths/glob patterns, or a single ``str``/``Path``.
        use_recategorization: When true, the original sentence is kept under
            ``snt_orig`` and ``snt`` is rebuilt from the ``tokens`` metadata
            without the ``-L...-`` / ``-R...-`` bracket tokens.
        dereify: Forwarded to ``pm_load``.
        remove_wiki: Forwarded to ``pm_load``.

    Returns:
        The list of loaded graphs.

    Raises:
        AssertionError: If ``paths`` is empty, a matched path does not exist,
            or no graph could be loaded.
    """
    assert paths, 'No paths given'

    # ``str`` is itself iterable, so the generic Iterable check alone would treat
    # a single string path as a sequence of one-character glob patterns.
    if isinstance(paths, (str, Path)) or not isinstance(paths, Iterable):
        paths = [paths]

    graphs = []
    for pattern in paths:
        for path in glob.glob(str(pattern)):
            path = Path(path)
            assert path.exists(), f'{path} not exist'
            graphs.extend(pm_load(path, dereify=dereify, remove_wiki=remove_wiki))

    assert graphs, 'No graphs loaded'

    if use_recategorization:
        for g in graphs:
            metadata = g.metadata
            metadata['snt_orig'] = metadata['snt']
            # NOTE(security): eval() executes arbitrary expressions found in the
            # AMR file metadata; only use this on trusted data.
            tokens = eval(metadata['tokens'])
            metadata['snt'] = ' '.join(
                t for t in tokens if not (t.startswith(('-L', '-R')) and t.endswith('-')))

    return graphs


================================================
FILE: hanlp/components/amr/seq2seq/dataset/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-04-27 19:29


================================================
FILE: hanlp/components/amr/seq2seq/dataset/dataset.py
================================================
from collections import Counter
from typing import Union, List, Callable, Tuple
import torch
import penman
from penman import Graph
from hanlp.common.dataset import TransformableDataset
from hanlp.components.amr.seq2seq.dataset.IO import read_raw_amr_data
from hanlp.components.amr.seq2seq.dataset.penman import role_is_reverted
from hanlp.components.amr.seq2seq.dataset.tokenization_bart import PENMANBartTokenizer
from phrasetree.tree import Tree
import json

from hanlp_common.constant import BOS, EOS, ROOT
from hanlp_common.io import load_pickle


class AMRDataset(TransformableDataset):
    """Dataset of AMR graphs; every sample is a dict with the graph under ``'amr'``."""

    def __init__(self,
                 data: Union[str, List],
                 use_recategorization=False,
                 remove_wiki=False,
                 dereify=False,
                 transform: Union[Callable, List] = None,
                 cache=None,
                 generate_idx=None) -> None:
        """Store the loader options, then initialise the parent dataset.

        Args:
            data: Forwarded to the parent initializer.
            use_recategorization: Passed to ``read_raw_amr_data`` by ``load_file``.
            remove_wiki: Passed to ``read_raw_amr_data`` by ``load_file``.
            dereify: Passed to ``read_raw_amr_data`` by ``load_file``.
            transform: Forwarded to the parent initializer.
            cache: Forwarded to the parent initializer.
            generate_idx: Forwarded to the parent initializer.
        """
        # Presumably the parent initializer may trigger load_file(), which reads
        # these options, so they are set before calling it — verify against the parent.
        self.use_recategorization = use_recategorization
        self.remove_wiki = remove_wiki
        self.dereify = dereify
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath: str):
        """Yield one ``{'amr': graph}`` sample per graph found in ``filepath``."""
        graphs = read_raw_amr_data(
            [filepath],
            self.use_recategorization,
            remove_wiki=self.remove_wiki,
            dereify=self.dereify,
        )
        yield from ({'amr': graph} for graph in graphs)

    def get_roles(self):
        """Count the roles of all triples, with the last three characters of reverted roles removed."""
        counts = Counter()
        for sample in self.data:
            graph: Graph = sample['amr']
            for _source, role, _target in graph.triples:
                counts[role[:-3] if role_is_reverted(role) else role] += 1
        return counts

    def get_frames(self):
        """Count instance targets that look like ``name-NN`` (one hyphen, two-digit suffix)."""
        counts = Counter()
        for sample in self.data:
            graph: Graph = sample['amr']
            for instance in graph.instances():
                concept = instance.target
                parts = concept.split('-')
                if len(parts) != 2:
                    continue
                sense = parts[1]
                if len(sense) == 2 and sense.isdigit():
                    counts[concept] += 1
        return counts


class AMRPickleDataset(AMRDataset):
    """AMR dataset whose samples are read with ``torch.load`` instead of raw AMR files."""

    def load_file(self, filepath: str):
        """Yield each stored sample after decoding its ``'amr'`` string into a penman graph.

        The decoded graph replaces the string in the sample dict itself.
        """
        for item in torch.load(filepath):
            item['amr'] = penman.decode(item['amr'])
            yield item


def dfs_linearize_tokenize(sample: dict, tokenizer: PENMANBartTokenizer, remove_space=False, text_key='snt') -> dict:
    """Linearize the sample's graph (if any) and tokenize its text.

    Args:
        sample: Sample dict; updated in place and returned.
        tokenizer: Provides ``linearize`` and ``encode``.
        remove_space: If true, all whitespace is removed from the text before encoding.
        text_key: Metadata key holding the sentence when the sample has a graph.

    Returns:
        The same ``sample`` with ``text`` and ``text_token_ids`` set, plus
        ``graph_tokens`` and ``graph_token_ids`` when a graph is present.
    """
    amr = sample.get('amr', None)
    if amr:
        graph_ids, extras = tokenizer.linearize(amr)
        sample['graph_tokens'] = extras['linearized_graphs']
        sample['graph_token_ids'] = graph_ids
        raw_text = amr.metadata[text_key]
    else:
        # Without a graph the caller must already have supplied the text.
        raw_text = sample['text']
    text = ''.join(raw_text.split()) if remove_space else raw_text
    sample['text'] = text
    sample['text_token_ids'] = tokenizer.encode(text)
    return sample


def dfs_linearize_levi(sample: dict, tokenizer: PENMANBartTokenizer, remove_space=False) -> dict:
    """Linearize the graph and encode the dependency tree in bracketed (Levi) form.

    The ``tok`` and ``dep`` metadata entries (JSON strings) are turned into a
    sequence by ``dep_to_levi``; that sequence becomes both the sample text and
    the source token ids. Samples without a graph are returned untouched.

    Args:
        sample: Sample dict; updated in place and returned.
        tokenizer: Provides ``linearize``, ``encode``, ``convert_tokens_to_ids``, ``INIT``
            and the BOS/EOS ids.
        remove_space: Accepted but not used by this function.

    Returns:
        The same ``sample``.
    """
    amr = sample.get('amr', None)
    if not amr:
        return sample

    graph_ids, extras = tokenizer.linearize(amr)
    sample['graph_tokens'] = extras['linearized_graphs']
    sample['graph_token_ids'] = graph_ids

    words = json.loads(amr.metadata['tok'])
    arcs = json.loads(amr.metadata['dep'])
    levi = dep_to_levi(words, arcs)
    sample['text'] = ' '.join(levi)

    brackets = ('(', ')')
    ids = []
    # Counts the non-bracket items seen so far: dep_to_levi emits relation, token,
    # relation, token, ... so even positions are relations and odd ones are tokens.
    position = 0
    for item in levi:
        is_bracket = item in brackets
        if is_bracket or position % 2 == 0:
            # Brackets and relations are looked up as single vocabulary entries.
            ids.append(tokenizer.convert_tokens_to_ids(tokenizer.INIT + item))
        else:
            # Tokens go through the regular subword encoder.
            ids.extend(tokenizer.encode(item, add_special_tokens=False))
        if not is_bracket:
            position += 1
    sample['text_token_ids'] = [tokenizer.bos_token_id] + ids + [tokenizer.eos_token_id]
    return sample


def dfs_linearize_rgcn(sample: dict, tokenizer: PENMANBartTokenizer) -> dict:
    """Linearize the graph and attach a dependency score tensor to the sample.

    Samples without a graph are returned untouched.

    Args:
        sample: Sample dict with ``tok`` and ``dep['scores']``; updated in place.
        tokenizer: Provides ``linearize`` and ``cls_token``.

    Returns:
        The same ``sample`` with ``graph_tokens``, ``graph_token_ids``, ``text``
        and ``dep_graph`` set.
    """
    amr = sample.get('amr', None)
    if not amr:
        return sample

    graph_ids, extras = tokenizer.linearize(amr)
    sample['graph_tokens'] = extras['linearized_graphs']
    sample['graph_token_ids'] = graph_ids

    # The text is a list: the CLS token followed by every token prefixed with a space.
    sample['text'] = [tokenizer.cls_token, *(' ' + word for word in sample['tok'])]

    scores = sample['dep']['scores']
    arc_scores = scores['arc_scores']
    rel_scores = scores['rel_scores']
    # Broadcast the arc scores over the trailing dimension of the relation scores.
    dep_graph = arc_scores[:, :, None] * rel_scores
    # Prepend one all-zero slice along the first dimension.
    zero_row = torch.zeros((1,) + dep_graph.shape[1:])
    sample['dep_graph'] = torch.cat([zero_row, dep_graph], dim=0)
    return sample


def dfs_linearize_constituency(sample: dict, tokenizer: PENMANBartTokenizer, remove_space=False) -> dict:
    """Linearize the graph and encode the bracketed constituency tree as source ids.

    The tree is rebuilt from the ``con_list`` metadata entry (JSON), printed on a
    single line and split at every bracket. Samples without a graph are
    returned untouched.

    Args:
        sample: Sample dict; updated in place and returned.
        tokenizer: Provides ``linearize``, ``encode``, ``convert_tokens_to_ids``, ``INIT``
            and the BOS/EOS ids.
        remove_space: If true, whitespace is removed from the stored ``text``.

    Returns:
        The same ``sample``.
    """
    amr = sample.get('amr', None)
    if not amr:
        return sample

    graph_ids, extras = tokenizer.linearize(amr)
    sample['graph_tokens'] = extras['linearized_graphs']
    sample['graph_token_ids'] = graph_ids

    tree = Tree.from_list(json.loads(amr.metadata['con_list']))
    # Hide literal bracket leaves so they are not confused with the tree's own brackets.
    for preterminal in tree.subtrees(lambda node: node.height() == 2):
        leaf = preterminal[0]
        if leaf == '(':
            preterminal[0] = '<LBR>'
        elif leaf == ')':
            preterminal[0] = '<RBR>'
    text = tree.pformat(margin=10e7)

    # Split the printed tree into brackets and the text between brackets.
    pieces = []
    pending = []
    for char in text:
        if char in ('(', ')'):
            pieces.extend((''.join(pending), char))
            pending.clear()
        else:
            pending.append(char)
    if pending:
        pieces.append(''.join(pending))

    # NOTE(review): only pieces that are exactly '<LBR>' / '<RBR>' are mapped back here;
    # confirm whether pieces of the form 'LABEL <LBR>' should be restored as well.
    restore_bracket = {'<LBR>': '(', '<RBR>': ')'}
    tokens = [restore_bracket.get(piece, piece) for piece in map(str.strip, pieces) if piece]

    ids = []
    for token in tokens:
        label, separator, word = token.partition(' ')
        if separator:
            # "LABEL word": the label is one vocabulary entry, the word is subword-encoded.
            ids.append(tokenizer.convert_tokens_to_ids(tokenizer.INIT + label))
            ids.extend(tokenizer.encode(word, add_special_tokens=False))
        else:
            ids.append(tokenizer.convert_tokens_to_ids(tokenizer.INIT + token))

    if remove_space:
        text = ''.join(text.split())
    sample['text'] = text
    sample['text_token_ids'] = [tokenizer.bos_token_id] + ids + [tokenizer.eos_token_id]
    return sample


def dfs_linearize_tokenize_with_linguistic_structures(sample: dict, tokenizer: PENMANBartTokenizer,
                                                      remove_space=False,
                                                      text_key='snt') -> dict:
    """Linearize the graph and align POS / NER / dependency labels to subtokens.

    The ``tok`` metadata entry is encoded token by token; every available
    annotation (``pos``, ``ner``, ``dep`` metadata, all JSON strings) is then
    repeated once per subtoken and wrapped with BOS/EOS entries. Samples
    without a graph are returned untouched.

    Args:
        sample: Sample dict; updated in place and returned.
        tokenizer: Provides ``linearize``, ``batch_encode_plus`` and the BOS/EOS ids.
        remove_space: If true, whitespace is removed from the stored ``text``.
        text_key: Metadata key holding the sentence.

    Returns:
        The same ``sample``.
    """
    amr = sample.get('amr', None)
    if not amr:
        return sample

    graph_ids, extras = tokenizer.linearize(amr)
    sample['graph_tokens'] = extras['linearized_graphs']
    sample['graph_token_ids'] = graph_ids

    text = amr.metadata[text_key]
    sample['text'] = ''.join(text.split()) if remove_space else text

    words = json.loads(amr.metadata['tok'])
    # One list of subtoken ids per token.
    pieces = tokenizer.batch_encode_plus(words, add_special_tokens=False).input_ids
    sample['text_token_ids'] = [tokenizer.bos_token_id] + sum(pieces, []) + [tokenizer.eos_token_id]

    def spread(labels):
        # Repeat each token-level label once for every subtoken of that token.
        return [label for ids, label in zip(pieces, labels) for _ in range(len(ids))]

    pos = amr.metadata.get('pos', None)
    if pos:
        sample['pos'] = [BOS] + spread(json.loads(pos)) + [EOS]

    ner = amr.metadata.get('ner', None)
    if ner is not None:
        spans = json.loads(ner)
        token_tags = ['O'] * len(pieces)
        for _form, tag, start, end in spans:
            token_tags[start:end] = [tag] * (end - start)
        sample['ner'] = [BOS] + spread(token_tags) + [EOS]

    dep = amr.metadata.get('dep', None)
    if dep:
        # Index 0 stands for the root/BOS position; token i (1-based) maps to the
        # position of its first subtoken, counting BOS as position 0.
        first_subtoken = [0]
        offset = 1
        for ids in pieces:
            first_subtoken.append(offset)
            offset += len(ids)
        heads, relations = [0], [BOS]
        for ids, (head, relation) in zip(pieces, json.loads(dep)):
            heads.extend([first_subtoken[head]] * len(ids))
            relations.extend([relation] * len(ids))
        sample['dep_arc'] = heads + [0]
        sample['dep_rel'] = relations + [EOS]
    return sample


def dep_to_levi(tok: List[str], dep: List[Tuple[int, str]]):
    """Flatten a dependency tree into a bracketed relation/token sequence.

    Args:
        tok: Tokens of the sentence.
        dep: One ``(head, relation)`` pair per token; a head of 0 marks the root.

    Returns:
        The sequence produced by ``dfs`` starting from the first root token.

    Raises:
        IndexError: If no token has head 0.
    """
    root_positions = [position for position, arc in enumerate(dep) if arc[0] == 0]
    linearized = []
    dfs(tok, dep, root_positions[0], linearized)
    return linearized


def dfs(tok: List[str], dep: List[Tuple[int, str]], s, seq):
    """Append the subtree rooted at token ``s`` to ``seq`` in depth-first order.

    The relation and the token of ``s`` are appended first; if ``s`` has
    dependents they follow, wrapped in ``'('`` and ``')'``.

    Args:
        tok: Tokens of the sentence.
        dep: One ``(head, relation)`` pair per token; heads are 1-based token positions.
        s: 0-based index of the subtree root.
        seq: Output list, extended in place.
    """
    seq.append(dep[s][1])
    seq.append(tok[s])
    head_id = s + 1  # heads are 1-based while ``s`` is 0-based
    dependents = [position for position, arc in enumerate(dep) if arc[0] == head_id]
    if not dependents:
        return
    seq.append('(')
    for dependent in dependents:
        dfs(tok, dep, dependent, seq)
    seq.append(')')


================================================
FILE: hanlp/components/amr/seq2seq/dataset/linearization.py
================================================
import abc
import itertools
from collections import deque, defaultdict
import re
from typing import List, Optional, Dict, Any, Set, TypeVar
from dataclasses import dataclass
import networkx as nx
import penman


@dataclass
class SemanticGraph:
    """Container for a linearized semantic graph."""

    # List of linearized nodes, with special tokens.
    nodes_var: List[str]
    # List of linearized edges, with special tokens (may be None).
    edges: Optional[List[str]]
    # List of backpointers to handle reentrancies and cycles.
    backreferences: List[int]
    # Dict from var ids to 'lemmatized' readable strings qualifying the node
    # (collapsing the :instance edge for AMR).
    var2instance: Dict[str, str]
    # Holds extra stuff that might be useful, e.g. alignments, NER, EL.
    extra: Dict[str, Any]

    @property
    def variables(self) -> Set[str]:
        """Set of entries of ``nodes_var`` that do not start with ``'<'``."""
        return {v for v in self.nodes_var if not v.startswith('<')}

    @property
    def resolved_nodes_var(self) -> List[str]:
        """``nodes_var`` with every position replaced by the entry its backreference points to."""
        nodes_var = self.nodes_var
        return [nodes_var[b] for b in self.backreferences]

    @property
    def nodes(self) -> List[str]:
        """Linearized nodes with var ids replaced by instances."""
        var2instance = self.var2instance
        return [var2instance.get(node, node) for node in self.nodes_var]

    @property
    def resolved_nodes(self) -> List[str]:
        """``nodes`` with every position replaced by the entry its backreference points to."""
        # ``nodes`` builds a new list on every access, so fetch it once instead of
        # once per backreference.
        nodes = self.nodes
        return [nodes[b] for b in self.backreferences]

    def src_occurrence(self, var: str) -> int:
        """Unimplemented placeholder; currently returns ``None``."""
        pass


class BaseLinearizer(metaclass=abc.ABCMeta):
    """Abstract interface of objects that linearize a graph into a SemanticGraph."""

    @abc.abstractmethod
    def linearize(self, *args, **kwargs) -> SemanticGraph:
        """Linearize a graph; concrete subclasses must override this method."""


class AMRTokens:
    """Special tokens used by the linearizer, plus helpers to recognise them."""

    START, END = '<', '>'
    # Every special token is a name wrapped in START/END.
    _TEMPL = f'{START}{{}}{END}'

    # Node-side tokens.
    BOS_N = _TEMPL.format('s')
    EOS_N = _TEMPL.format('/s')
    START_N = _TEMPL.format('start')
    STOP_N = _TEMPL.format('stop')
    PNTR_N = _TEMPL.format('pointer')

    # Delimiters of a literal.
    LIT_START = _TEMPL.format('lit')
    LIT_END = _TEMPL.format('/lit')

    # Backreference templates; XXX stands for the referenced index.
    BACKR_SRC_N = _TEMPL.format('backr:src:XXX')
    BACKR_TRG_N = _TEMPL.format('backr:trg:XXX')

    # Edge-side tokens share the spelling of their node-side counterparts.
    BOS_E, EOS_E, START_E, STOP_E = BOS_N, EOS_N, START_N, STOP_N

    _FIXED_SPECIAL_TOKENS_N = {BOS_N, EOS_N, START_N, STOP_N}
    _FIXED_SPECIAL_TOKENS_E = {BOS_E, EOS_E, START_E, STOP_E}
    _FIXED_SPECIAL_TOKENS = _FIXED_SPECIAL_TOKENS_N.union(_FIXED_SPECIAL_TOKENS_E)

    # match and read backreferences
    _re_BACKR_SRC_N = re.compile(BACKR_SRC_N.replace('XXX', r'([0-9]+)'))
    _re_BACKR_TRG_N = re.compile(BACKR_TRG_N.replace('XXX', r'([0-9]+)'))

    @classmethod
    def is_node(cls, string: str) -> bool:
        """Return False for strings starting with ``':'`` and for the fixed edge tokens, else True."""
        looks_like_relation = isinstance(string, str) and string.startswith(':')
        return not (looks_like_relation or string in cls._FIXED_SPECIAL_TOKENS_E)

    @classmethod
    def read_backr(cls, string: str) -> Optional:
        """Search ``string`` for a source, then a target, backreference.

        Returns:
            The first regex match found (group 1 holds the digits) or ``None``.
        """
        # Match objects are always truthy, so ``or`` falls through only on None.
        return cls._re_BACKR_SRC_N.search(string) or cls._re_BACKR_TRG_N.search(string)


T = TypeVar('T')


def index_default(
        item: T, list_: List[T],
        start: Optional[int] = None,
        stop: Optional[int] = None,
        default: Optional[int] = None
):
    """Like ``list.index`` restricted to ``list_[start:stop]``, with a fallback value.

    Args:
        item: Value to look for.
        list_: List to search.
        start: First index of the searched slice; ``None`` means 0.
        stop: End of the searched slice (slice semantics); ``None`` means ``len(list_)``.
        default: Returned when ``item`` is not found.

    Returns:
        The index in ``list_`` of the first match inside the slice, or ``default``.
    """
    lower = 0 if start is None else start
    upper = len(list_) if stop is None else stop
    for position, candidate in enumerate(list_[lower:upper], start=lower):
        if candidate == item:
            return position
    return default


class AMRLinearizer(BaseLinearizer):

    def __init__(
            self,
            use_pointer_tokens: bool = True,
            collapse_name_ops: bool = False,
    ):
        """Configure the linearizer.

        Args:
            use_pointer_tokens: If true, ``linearize`` finishes with ``_add_pointer_tokens``.
            collapse_name_ops: If true, ``linearize`` first runs ``_collapse_name_ops``.
        """
        self.collapse_name_ops = collapse_name_ops
        # Stored but not read by any method of this class.
        self.interleave_edges = False
        self.use_pointer_tokens = use_pointer_tokens

    def _collapse_name_ops(self, amr):
        """Merge the ``:opN`` literals of every ``name`` node into a single ``:op1`` triple.

        The literals are ordered by their op number, stripped of double quotes and
        joined with ``'_'``; the merged triple takes the position of the earliest
        replaced triple.

        Args:
            amr: Graph whose ``triples`` and ``metadata`` are read; it is not modified.

        Returns:
            A new ``penman.Graph`` sharing the metadata of ``amr``.
        """
        # Variables whose concept is ``name``.
        name_vars = {source for source, role, target in amr.triples
                     if role == ':instance' and target == 'name'}

        # For each of them, collect (triple position, role, unquoted literal).
        ops_by_var = {}
        for position, (source, role, target) in enumerate(amr.triples):
            if source in name_vars and role.startswith(':op'):
                ops_by_var.setdefault(source, []).append((position, role, target.strip('"')))

        triples = amr.triples.copy()
        for name_var, ops in ops_by_var.items():
            ordered = sorted(ops, key=lambda op: int(op[1][3:]))
            positions = [op[0] for op in ordered]
            literals = [op[2] for op in ordered]
            # Blank out all op triples, then put the merged one in the earliest slot.
            for position in positions:
                triples[position] = None
            merged = '"' + '_'.join(literals) + '"'
            triples[min(positions)] = penman.Triple(name_var, ':op1', merged)

        collapsed = penman.Graph([triple for triple in triples if triple is not None])
        collapsed.metadata = amr.metadata
        return collapsed

    def linearize(self, amr: penman.Graph) -> SemanticGraph:
        """Run the linearization pipeline on ``amr``.

        Steps: optional name-op collapsing, breadth-first linearization,
        interleaving of edges and nodes, optional pointer tokens.

        Args:
            amr: Graph to linearize.

        Returns:
            The resulting ``SemanticGraph``.
        """
        graph = self._collapse_name_ops(amr) if self.collapse_name_ops else amr
        interleaved = self._interleave(self._linearize(graph))
        if not self.use_pointer_tokens:
            return interleaved
        return self._add_pointer_tokens(interleaved)

    def _linearize(self, amr: penman.Graph) -> SemanticGraph:
        """Visit the graph queue-first and emit parallel node / edge / backreference lists.

        Every visited variable produces a segment: the variable itself (paired with
        the START edge token), one (edge, target) pair per outgoing edge in triple
        order, and a closing STOP token. Variables that were already emitted are
        written as ``AMRTokens.BACKR_TRG_N`` with a backreference to their first
        position. The three lists always have the same length.

        Args:
            amr: Graph to linearize.

        Returns:
            A ``SemanticGraph`` whose ``extra`` holds the networkx graph (``'graph'``)
            and the input (``'amr'``).
        """
        variables = set(amr.variables())
        # Variables are prefixed so they cannot be confused with constants.
        variables = {'var:' + v for v in variables}
        var2instance = {}

        graph = nx.MultiDiGraph()

        # Position of each triple in the graph; used to keep the original ordering.
        triples2order = {k: i for i, k in enumerate(amr.triples)}

        for triple in amr.triples:
            var, rel, instance = triple
            order = triples2order[triple]
            if rel != ':instance':
                continue
            # NOTE(review): range(order - 1, -1) uses the default step of +1 and is
            # therefore empty for every order >= 0, so only the forward range is ever
            # searched. A backward search would need range(order - 1, -1, -1); changing
            # it alters the node 'expansion' values and hence the visiting order, so
            # confirm before fixing.
            for expansion_candidate in itertools.chain(range(order - 1, -1), range(order + 1, len(amr.triples))):
                if var == amr.triples[expansion_candidate][2]:
                    expansion = expansion_candidate
                    break
            else:
                expansion = 0
            var = 'var:' + var
            var2instance[var] = instance
            graph.add_node(var, instance=instance, order=order, expansion=expansion)

        for triple in amr.edges():
            var1, rel, var2 = triple
            order = triples2order[triple]
            if rel == ':instance':
                continue
            var1 = 'var:' + var1
            var2 = 'var:' + var2
            graph.add_edge(var1, var2, rel=rel, order=order)

        for triple in amr.attributes():
            var, rel, attr = triple
            order = triples2order[triple]
            if rel == ':instance':
                continue
            var = 'var:' + var
            # Attribute targets are constants and are added without the 'var:' prefix.
            graph.add_edge(var, attr, rel=rel, order=order)

        # nodes that are not reachable from the root (e.g. because of reification)
        # will be present in the not_explored queue
        # undirected_graph = graph.to_undirected()
        # print(amr.variables())
        not_explored = deque(sorted(variables, key=lambda x: nx.get_node_attributes(graph, 'order')[x]))
        # (
        #     len(nx.shortest_path(undirected_graph, 'var:' + amr.top, x)),
        #     -graph.out_degree(x),
        # )

        # first_index: position in nodes_visit where a variable was first emitted.
        first_index = {}
        explored = set()
        added_to_queue = set()
        nodes_visit = [AMRTokens.BOS_N]
        edges_visit = [AMRTokens.BOS_E]
        backreferences = [0]
        queue = deque()
        queue.append('var:' + amr.top)

        while queue or not_explored:

            if queue:
                node1 = queue.popleft()
            else:
                # The queue ran dry: fall back to variables in triple order, skipping
                # those already queued and those without outgoing edges.
                node1 = not_explored.popleft()
                if node1 in added_to_queue:
                    continue
                if not list(graph.successors(node1)):
                    continue

            if node1 in variables:
                if node1 in explored:
                    continue
                if node1 in first_index:
                    nodes_visit.append(AMRTokens.BACKR_TRG_N)
                    backreferences.append(first_index[node1])
                else:
                    backreferences.append(len(nodes_visit))
                    first_index[node1] = len(nodes_visit)
                    nodes_visit.append(node1)
                edges_visit.append(AMRTokens.START_E)

                # Outgoing edges of node1, sorted by (triple order, relation, target).
                successors = []
                for node2 in graph.successors(node1):
                    for edge_data in graph.get_edge_data(node1, node2).values():
                        rel = edge_data['rel']
                        order = edge_data['order']
                        successors.append((order, rel, node2))
                successors = sorted(successors)

                for order, rel, node2 in successors:
                    edges_visit.append(rel)

                    # node2 is a variable
                    if node2 in variables:
                        # ... which was mentioned before
                        if node2 in first_index:
                            nodes_visit.append(AMRTokens.BACKR_TRG_N)
                            backreferences.append(first_index[node2])

                        # .. which is mentioned for the first time
                        else:
                            backreferences.append(len(nodes_visit))
                            first_index[node2] = len(nodes_visit)
                            nodes_visit.append(node2)

                        # 1) not already in Q
                        # 2) has children
                        # 3) the edge right before its expansion has been encountered
                        if (node2 not in added_to_queue) and list(graph.successors(node2)) and (
                                nx.get_node_attributes(graph, 'expansion')[node2] <= order):
                            queue.append(node2)
                            added_to_queue.add(node2)

                    # node2 is a constant
                    else:
                        backreferences.append(len(nodes_visit))
                        nodes_visit.append(node2)

                # Close the segment of node1.
                backreferences.append(len(nodes_visit))
                nodes_visit.append(AMRTokens.STOP_N)
                edges_visit.append(AMRTokens.STOP_E)
                explored.add(node1)

            else:
                backreferences.append(len(nodes_visit))
                nodes_visit.append(node1)
                explored.add(node1)

        backreferences.append(len(nodes_visit))
        nodes_visit.append(AMRTokens.EOS_N)
        edges_visit.append(AMRTokens.EOS_E)
        assert len(nodes_visit) == len(edges_visit) == len(backreferences)
        return SemanticGraph(
            nodes_visit,
            edges_visit,
            backreferences,
            var2instance,
            extra={'graph': graph, 'amr': amr}
        )

    def _interleave(self, graph: SemanticGraph) -> SemanticGraph:
        """Merge the parallel node and edge lists into one sequence.

        Each STOP-terminated segment ``src, t1, t2, ..., <stop>`` with edges
        ``e1, e2, ...`` becomes ``src, e1, t1, e2, t2, ..., <stop>``.
        Backreferences are remapped to positions in the merged sequence; edges
        always point to themselves.

        Args:
            graph: Output of ``_linearize`` (``edges`` must be set).

        Returns:
            A new ``SemanticGraph`` with ``edges`` set to ``None``.
        """
        old_to_new = []        # old node index -> position in the merged sequence
        merged = []
        merged_backrefs = []

        def emit_node(node, backr=None):
            old_position = len(old_to_new)
            new_position = len(merged)
            old_to_new.append(new_position)
            merged.append(node)
            if backr is None or backr == old_position:
                # The node refers to itself.
                merged_backrefs.append(new_position)
            else:
                merged_backrefs.append(old_to_new[backr])

        def emit_edge(edge):
            merged.append(edge)
            merged_backrefs.append(len(merged_backrefs))

        def next_stop(begin):
            # The last element (EOS) is excluded from the search.
            return index_default(AMRTokens.STOP_N, graph.nodes_var, begin, -1, -1)

        emit_node(AMRTokens.BOS_N)

        segment_start = 1
        segment_stop = next_stop(segment_start)
        while segment_stop > -1:
            # src node
            emit_node(graph.nodes_var[segment_start], graph.backreferences[segment_start])

            # edges and trg nodes, interleaved
            inner = slice(segment_start + 1, segment_stop)
            for target, edge, backr in zip(graph.nodes_var[inner], graph.edges[inner], graph.backreferences[inner]):
                emit_edge(edge)
                emit_node(target, backr)

            # stop
            emit_node(graph.nodes_var[segment_stop], graph.backreferences[segment_stop])

            segment_start = segment_stop + 1
            segment_stop = next_stop(segment_start)

        emit_node(AMRTokens.EOS_N)

        return SemanticGraph(
            merged,
            None,
            merged_backrefs,
            graph.var2instance,
            extra=graph.extra,
        )

    def _add_pointer_tokens(self, graph: SemanticGraph) -> SemanticGraph:
        """Replace backreferences by explicit ``<pointer:N>`` tokens.

        A variable is written as its pointer token followed by the variable
        itself; a backreference token is written as the pointer of the variable
        it refers to. Pointers are numbered in order of first appearance.

        Args:
            graph: Graph whose ``nodes_var`` and ``backreferences`` are read.

        Returns:
            A new ``SemanticGraph`` in which every position references itself.
        """
        pointer_of = {}
        output = []
        for node, backr in zip(graph.nodes_var, graph.backreferences):
            if node == AMRTokens.BACKR_TRG_N:
                # Look up the pointer of the variable this backreference targets.
                output.append(pointer_of[graph.nodes_var[backr]])
            elif node in graph.var2instance:
                if node not in pointer_of:
                    pointer_of[node] = f"<pointer:{len(pointer_of)}>"
                output.extend((pointer_of[node], node))
            else:
                output.append(node)

        return SemanticGraph(
            output,
            None,
            list(range(len(output))),
            graph.var2instance,
            extra=graph.extra,
        )


================================================
FILE: hanlp/components/amr/seq2seq/dataset/penman.py
================================================
from typing import List

from penman import load as load_, Graph, Triple
from penman import loads as loads_
from penman import encode as encode_
from penman.model import Model
from penman.models.noop import NoOpModel
from penman.models import amr
import penman
import logging

# Shared penman model instances, selected by _get_model and used by pm_encode.
op_model = Model()
noop_model = NoOpModel()
amr_model = amr.model
# Model returned by _get_model when ``dereify`` is None.
DEFAULT = op_model

# Mute loggers
# (only CRITICAL records of penman's layout and parser loggers get through;
# note that penman._parse is a private module of the library)
penman.layout.logger.setLevel(logging.CRITICAL)
penman._parse.logger.setLevel(logging.CRITICAL)


def _get_model(dereify):
    """Pick the penman model for a ``dereify`` setting.

    Args:
        dereify: ``None`` selects ``DEFAULT``; any other truthy value selects
            ``op_model``; any other falsy value selects ``noop_model``.

    Returns:
        The selected module-level model instance.
    """
    if dereify is None:
        return DEFAULT
    return op_model if dereify else noop_model


def _remove_wiki(graph):
    """Return a copy of ``graph`` in which every ``:wiki`` target is replaced by ``'+'``.

    Args:
        graph: Graph whose ``triples`` and ``metadata`` are read; it is not modified.

    Returns:
        A new ``Graph`` with the same metadata object.
    """
    masked = []
    for triple in graph.triples:
        source, role, _target = triple
        masked.append(Triple(source, role, '+') if role == ':wiki' else triple)
    stripped = Graph(masked)
    stripped.metadata = graph.metadata
    return stripped


def pm_load(source, dereify=None, remove_wiki=False) -> List[penman.Graph]:
    """Load graphs with ``penman.load`` using the model selected by ``dereify``.

    Args:
        source: Passed to ``penman.load`` as ``source``.
        dereify: Passed to ``_get_model`` to choose the penman model.
        remove_wiki: If true, every loaded graph is passed through ``_remove_wiki``.

    Returns:
        The list of loaded graphs.
    """
    graphs = load_(source=source, model=_get_model(dereify))
    if remove_wiki:
        # Replace the elements in place so the list object itself is kept.
        graphs[:] = [_remove_wiki(graph) for graph in graphs]
    return graphs


def loads(string, dereify=None, remove_wiki=False):
    """Parse graphs from a string with ``penman.loads`` using the model selected by ``dereify``.

    Args:
        string: Passed to ``penman.loads`` as ``string``.
        dereify: Passed to ``_get_model`` to choose the penman model.
        remove_wiki: If true, every parsed graph is passed through ``_remove_wiki``.

    Returns:
        The list of parsed graphs.
    """
    graphs = loads_(string=string, model=_get_model(dereify))
    if remove_wiki:
        # Replace the elements in place so the list object itself is kept.
        graphs[:] = [_remove_wiki(graph) for graph in graphs]
    return graphs


def pm_encode(g, top=None, indent=-1, compact=False):
    """Serialize ``g`` with ``penman.encode``, always using ``amr_model``.

    Args:
        g: Graph to encode.
        top: Passed through to ``penman.encode``.
        indent: Passed through to ``penman.encode``.
        compact: Passed through to ``penman.encode``.

    Returns:
        Whatever ``penman.encode`` returns for these arguments.
    """
    return encode_(g=g, top=top, indent=indent, compact=compact, model=amr_model)


def role_is_reverted(role: str):
    """Tell whether ``role`` ends with ``-of`` without ending with ``consist-of``."""
    return role.endswith('-of') and not role.endswith('consist-of')


class AMRGraph(penman.Graph):
    """A ``penman.Graph`` whose ``str()`` is its PENMAN serialization."""

    def __str__(self):
        # Uses penman.encode with its default arguments.
        return penman.encode(self)


================================================
FILE: hanlp/components/amr/seq2seq/dataset/postprocessing.py
================================================
from collections import defaultdict, Counter
import enum
import re
import networkx as nx
import penman

from hanlp.components.amr.seq2seq.dataset.penman import pm_encode

# A tiny hard-coded graph: a ``bark-01`` instance whose :ARG0 is a ``dog`` instance.
# NOTE(review): presumably the fallback result when a predicted graph cannot be
# reconstructed -- confirm at the call sites.
BACKOFF = penman.Graph([
    penman.Triple('d2', ':instance', 'dog'),
    penman.Triple('b1', ':instance', 'bark-01'),
    penman.Triple('b1', ':ARG0', 'd2'), ])


def token_processing(tok):
    """Normalize one decoded token.

    * ``None`` is returned unchanged.
    * A token made only of digits is converted to the number it denotes as a
      Python literal. If it is not a valid literal (for example it has leading
      zeros such as ``'007'``, or uses non-ASCII digits) the string is kept.
    * A token with a double quote on exactly one side gets the missing quote added.
    * Anything else is returned unchanged.

    Args:
        tok: Token string or ``None``.

    Returns:
        ``None``, an ``int`` or a ``str`` as described above.
    """
    if tok is None:
        return None
    if tok.isdigit():
        # literal_eval parses the same literal grammar as eval did here, but cannot
        # execute code, and only parse failures are swallowed (a bare ``except``
        # would also hide KeyboardInterrupt/SystemExit).
        import ast
        try:
            return ast.literal_eval(tok)
        except (ValueError, SyntaxError):
            return tok
    opens = tok.startswith('"')
    closes = tok.endswith('"')
    if opens and not closes:
        return tok + '"'
    if closes and not opens:
        return '"' + tok
    return tok


def decode_into_node_and_backreferences(subtoken_ids, tokenizer):
    """Turn predicted subtoken ids into graph tokens and backreferences.

    Steps performed, in order:

    1. ids are converted to subtoken strings; ids ``>= len(tokenizer)`` are read
       as backreferences (offset ``id - len(tokenizer)``), all others get ``-1``;
    2. padding subtokens are dropped;
    3. subtokens are merged into whole tokens; a backreference becomes a ``None``
       token whose backreference is the index of the token it points to;
    4. ``'<unk>'`` tokens are replaced by ``'thing'``;
    5. ``<lit> ... </lit>`` spans are merged into one quoted, ``'_'``-joined token;
    6. every token goes through ``token_processing``;
    7. the first token (and the second one too if it is ``'<s>'``) and a trailing
       ``'</s>'`` are removed, shifting the backreferences accordingly.

    Args:
        subtoken_ids: Sequence of integer ids.
        tokenizer: Provides ``INIT``, ``convert_ids_to_tokens``,
            ``convert_tokens_to_string`` and ``len()``.

    Returns:
        A ``(tokens, backreferences)`` pair of equally long lists; a
        backreference of ``-1`` means "no reference".
    """
    # Subtokens that continue an edge after a lone ':' (see the patch below).
    rex_arg = re.compile(f"^{tokenizer.INIT}(op|snt|conj|prep)")
    # Special tokens after which a new token always starts.
    rex_spc = re.compile(r"<(s|/s|lit|/lit|stop|unk|pad|mask)>")

    # get strings
    subtokens = tokenizer.convert_ids_to_tokens(subtoken_ids)
    # fix backreferences
    subtoken_backreferences = [max(t - len(tokenizer), -1) for t in subtoken_ids]
    # strip padding
    no_pad = [(s, b) for s, b in zip(subtokens, subtoken_backreferences) if s != (tokenizer.INIT + '<pad>')]
    if no_pad:
        subtokens, subtoken_backreferences = zip(*no_pad)
    else:
        subtokens, subtoken_backreferences = ['<s>'], [-1]

    # subword collapse
    tokens = []
    backreferences = []
    # Maps each subtoken position to the index of the token it ends up in.
    subword_to_token_map = {}
    current_token_i = 0
    for subw_i, (subw_backr, subtok) in enumerate(zip(subtoken_backreferences, subtokens)):
        subword_to_token_map[subw_i] = current_token_i

        # if empty you cannot do anything but add a new word
        if not tokens:
            tokens.append(subtok.lstrip(tokenizer.INIT))
            backreferences.append(-1)
            current_token_i += 1

        # backref can't be splitted
        elif subw_backr > -1:
            tokens.append(None)
            backreferences.append(subword_to_token_map[subw_backr])
            current_token_i += 1

        # after a special token release
        elif isinstance(tokens[-1], str) and rex_spc.match(tokens[-1]):
            tokens.append(subtok.lstrip(tokenizer.INIT))
            backreferences.append(-1)
            current_token_i += 1

        # after a subtoken ':' (which should be followed by the rest of the edge) ignore tokenizer.INIT
        # TODO: this is an ugly patch due to the fact that BART tokenizer splits after ':'
        elif (tokens[-1] == ':') and rex_arg.match(subtok):
            tokens[-1] = tokens[-1] + subtok[1:]

        # leading tokenizer.INIT
        elif subtok.startswith(tokenizer.INIT):
            tokens.append(subtok.lstrip(tokenizer.INIT))
            backreferences.append(-1)
            current_token_i += 1

        # very ugly patch for some cases in which tokenizer.INIT is not in the following token to the edge
        elif isinstance(tokens[-1], str) and tokens[-1].startswith(':') and tokens[-1][-1].isdigit() and (
                subtok != '-of'):
            tokens.append(subtok.lstrip(tokenizer.INIT))
            backreferences.append(-1)
            current_token_i += 1

        # in any other case attach to the previous
        else:
            tokens[-1] = tokens[-1] + subtok

    # strip INIT and fix byte-level
    tokens = [tokenizer.convert_tokens_to_string(list(t)).lstrip() if isinstance(t, str) else t for t in tokens]
    # tokens = [t.replace(tokenizer.INIT, '') if isinstance(t, str) else t for t in tokens]

    # unks are substituted with thing
    tokens = [t if t != '<unk>' else 'thing' for t in tokens]

    old_tokens = tokens
    old_backreferences = backreferences

    # <lit> Barack Obama </lit> -> "Barack Obama"
    tokens = []
    backreferences = []
    # Maps indices in old_tokens to indices in the rebuilt token list.
    token_to_token_map = {}
    start_search = 0
    # Number of old tokens dropped so far by merging literals.
    removed = 0
    while True:
        try:

            # Copy everything before the next '<lit>' unchanged (index() raises
            # ValueError when no '<lit>' is left, which ends the loop below).
            lit_start = old_tokens.index('<lit>', start_search)
            token_addition = old_tokens[start_search:lit_start]
            for i, t in enumerate(token_addition, start=start_search):
                token_to_token_map[i] = i - removed
            tokens += token_addition

            backreferences_addition = [token_to_token_map[b] if b > -1 else -1 for b in
                                       old_backreferences[start_search:lit_start]]
            backreferences += backreferences_addition

            lit_end = min(lit_start + 2, len(old_tokens) - 1)

            # Scan forward for the end of the literal.
            # NOTE(review): if no terminator is found, start_search ends at
            # len(old_tokens) and the tokens from '<lit>' onwards are not copied.
            while lit_end < len(old_tokens):
                old_tok = old_tokens[lit_end]

                # Unterminated literal: an edge or '<stop>' closes it implicitly and
                # is kept as a token of its own.
                if isinstance(old_tok, str) and (
                        (old_tok.startswith(':') and len(old_tok) > 3) or (old_tok == '<stop>')):
                    res_tok = old_tokens[lit_start + 1:lit_end]
                    for i in range(lit_start, lit_end):
                        token_to_token_map[i] = len(tokens)

                    # Remove possible wrong None
                    res = old_tokens[lit_start + 1:lit_end]
                    res = [str(r) for r in res if r is not None]
                    res = '"' + '_'.join(res) + '"'

                    removed += len(res_tok)
                    start_search = lit_end
                    tokens += [res, old_tok]
                    backreferences += [-1, -1]
                    break

                # Properly terminated literal.
                elif old_tok == '</lit>':
                    res_tok = old_tokens[lit_start + 1:lit_end]
                    for i in range(lit_start, lit_end + 1):
                        token_to_token_map[i] = len(tokens)

                    # Remove possible wrong None
                    res = old_tokens[lit_start + 1:lit_end]
                    res = [str(r) for r in res if r is not None]
                    res = '"' + '_'.join(res) + '"'

                    removed += len(res_tok) + 1
                    start_search = lit_end + 1
                    tokens.append(res)
                    backreferences.append(-1)
                    break

                else:
                    lit_end += 1
                    start_search = lit_end

        except ValueError:
            # No further '<lit>': copy the remaining tokens and stop.
            token_addition = old_tokens[start_search:]
            for i, t in enumerate(token_addition, start=start_search):
                token_to_token_map[i] = i - removed
            backreferences_addition = [token_to_token_map[b] if b > -1 else b for b in
                                       old_backreferences[start_search:]]
            tokens += token_addition
            backreferences += backreferences_addition
            break

    tokens = [token_processing(t) for t in tokens]

    # Drop the first token, and the second one as well if it is '<s>'.
    shift = 1
    if len(tokens) > 1 and tokens[1] == '<s>':
        shift = 2

    tokens = tokens[shift:]
    backreferences = [b if b == -1 else b - shift for b in backreferences[shift:]]

    if tokens and tokens[-1] == '</s>':
        tokens.pop()
        backreferences.pop()

    return tokens, backreferences


def decode_into_node_and_backreferences_without_space(subtoken_ids, tokenizer):
    """Collapse a sequence of sub-token ids into graph tokens and back-references.

    The function works in four stages:

    1. ids are converted to sub-token strings; ids at or above ``len(tokenizer)``
       are read as back-references to sub-token position ``id - len(tokenizer)``;
    2. sub-tokens are merged into whole tokens (a back-reference always becomes
       its own ``None`` token);
    3. every ``<lit> ... </lit>`` span is replaced by one quoted token whose
       words are joined with ``_``;
    4. each token goes through ``token_processing`` and the leading/trailing
       sentence markers are removed.

    Args:
        subtoken_ids: Iterable of integer ids produced by the model.
        tokenizer: Object providing ``INIT``, ``__len__``,
            ``convert_ids_to_tokens`` and ``convert_tokens_to_string``.

    Returns:
        ``(tokens, backreferences)``: two parallel lists. ``backreferences[i]``
        is ``-1`` for an ordinary token, otherwise the index of the token that
        position ``i`` refers back to.
    """
    # Edge names that may follow a lone ':' sub-token (prefixed by tokenizer.INIT).
    rex_arg = re.compile(f"^{tokenizer.INIT}(op|snt|conj|prep)")
    # Special tokens after which the next sub-token always starts a new token.
    rex_spc = re.compile(r"<(s|/s|lit|/lit|stop|unk|pad|mask)>")

    # get strings
    subtokens = tokenizer.convert_ids_to_tokens(subtoken_ids)
    # fix backreferences: ids below len(tokenizer) are plain vocabulary ids (-1),
    # anything above is an offset pointing at an earlier sub-token position
    subtoken_backreferences = [max(t - len(tokenizer), -1) for t in subtoken_ids]
    # strip padding
    no_pad = [(s, b) for s, b in zip(subtokens, subtoken_backreferences) if s != (tokenizer.INIT + '<pad>')]
    if no_pad:
        subtokens, subtoken_backreferences = zip(*no_pad)
    else:
        # nothing but padding: fall back to a lone start marker
        subtokens, subtoken_backreferences = ['<s>'], [-1]

    # subword collapse
    tokens = []
    backreferences = []
    # sub-token position -> index of the token it was merged into
    subword_to_token_map = {}
    current_token_i = 0
    prev_is_pointer = False
    prev_is_rel = False
    for subw_i, (subw_backr, subtok) in enumerate(zip(subtoken_backreferences, subtokens)):
        subword_to_token_map[subw_i] = current_token_i
        is_pointer = subtok.startswith('<pointer:') and subtok.endswith('>')
        is_rel = subtok.startswith(':') and len(subtok) > 1
        # NOTE: substring test, so it is true for '(', ')', '()' and the empty string
        is_bracket = subtok in '()'

        # if empty you cannot do anything but add a new word
        if not tokens:
            tokens.append(subtok)
            backreferences.append(-1)
            current_token_i += 1

        # a back-reference cannot be split: it is always a token of its own (None placeholder)
        elif subw_backr > -1:
            tokens.append(None)
            backreferences.append(subword_to_token_map[subw_backr])
            current_token_i += 1

        # after a special token release
        elif isinstance(tokens[-1], str) and rex_spc.match(tokens[-1]):
            tokens.append(subtok)
            backreferences.append(-1)
            current_token_i += 1

        # after a subtoken ':' (which should be followed by the rest of the edge) ignore tokenizer.INIT
        # TODO: this is an ugly patch due to the fact that BART tokenizer splits after ':'
        elif (tokens[-1] == ':') and rex_arg.match(subtok):
            tokens[-1] = tokens[-1] + subtok[1:]

        # current or prev is a control token (pointer, relation, bracket, end marker);
        # '-of' is excluded so it stays glued to the relation it inverts
        elif (is_pointer or is_rel or prev_is_pointer or prev_is_rel or is_bracket or subtok == '</s>') \
                and subtok != '-of':
            tokens.append(subtok)
            backreferences.append(-1)
            current_token_i += 1

        # very ugly patch for some cases in which tokenizer.INIT is not in the following token to the edge
        elif isinstance(tokens[-1], str) and tokens[-1].startswith(':') and tokens[-1][-1].isdigit() and (
                subtok != '-of'):
            tokens.append(subtok)
            backreferences.append(-1)
            current_token_i += 1

        # in any other case attach to the previous
        else:
            tokens[-1] = tokens[-1] + subtok

        prev_is_pointer = is_pointer
        prev_is_rel = is_rel

    # strip INIT and fix byte-level; each token is handed over character by character
    tokens = [tokenizer.convert_tokens_to_string(list(t)).lstrip() if isinstance(t, str) else t for t in tokens]
    # tokens = [t.replace(tokenizer.INIT, '') if isinstance(t, str) else t for t in tokens]

    # unks are substituted with thing
    tokens = [t if t != '<unk>' else 'thing' for t in tokens]

    old_tokens = tokens
    old_backreferences = backreferences

    # <lit> Barack Obama </lit> -> "Barack Obama"
    tokens = []
    backreferences = []
    # old token index -> new token index, needed to re-target back-references
    token_to_token_map = {}
    start_search = 0
    # number of old tokens dropped so far by literal merging
    removed = 0
    while True:
        try:

            # raises ValueError when no further '<lit>' exists -> handled below
            lit_start = old_tokens.index('<lit>', start_search)
            # copy everything before the literal unchanged
            token_addition = old_tokens[start_search:lit_start]
            for i, t in enumerate(token_addition, start=start_search):
                token_to_token_map[i] = i - removed
            tokens += token_addition

            backreferences_addition = [token_to_token_map[b] if b > -1 else -1 for b in
                                       old_backreferences[start_search:lit_start]]
            backreferences += backreferences_addition

            # start looking for the end of the literal two tokens after '<lit>' (clamped to the last token)
            lit_end = min(lit_start + 2, len(old_tokens) - 1)

            # if this loop runs off the end without a terminator, start_search ends up at
            # len(old_tokens) and the tokens from '<lit>' onward are not copied to the output
            while lit_end < len(old_tokens):
                old_tok = old_tokens[lit_end]

                # literal was never closed: a relation or '<stop>' terminates it and is kept
                if isinstance(old_tok, str) and (
                        (old_tok.startswith(':') and len(old_tok) > 3) or (old_tok == '<stop>')):
                    res_tok = old_tokens[lit_start + 1:lit_end]
                    for i in range(lit_start, lit_end):
                        token_to_token_map[i] = len(tokens)

                    # Remove possible wrong None
                    res = old_tokens[lit_start + 1:lit_end]
                    res = [str(r) for r in res if r is not None]
                    res = '"' + '_'.join(res) + '"'

                    removed += len(res_tok)
                    start_search = lit_end
                    tokens += [res, old_tok]
                    backreferences += [-1, -1]
                    break

                # properly closed literal: the closing tag itself is consumed
                elif old_tok == '</lit>':
                    res_tok = old_tokens[lit_start + 1:lit_end]
                    for i in range(lit_start, lit_end + 1):
                        token_to_token_map[i] = len(tokens)

                    # Remove possible wrong None
                    res = old_tokens[lit_start + 1:lit_end]
                    res = [str(r) for r in res if r is not None]
                    res = '"' + '_'.join(res) + '"'

                    removed += len(res_tok) + 1
                    start_search = lit_end + 1
                    tokens.append(res)
                    backreferences.append(-1)
                    break

                else:
                    lit_end += 1
                    start_search = lit_end

        except ValueError:
            # no more literals: copy the remaining tokens and finish
            token_addition = old_tokens[start_search:]
            for i, t in enumerate(token_addition, start=start_search):
                token_to_token_map[i] = i - removed
            backreferences_addition = [token_to_token_map[b] if b > -1 else b for b in
                                       old_backreferences[start_search:]]
            tokens += token_addition
            backreferences += backreferences_addition
            break

    tokens = [token_processing(t) for t in tokens]

    # drop the first token only when the second token is '<s>'; otherwise nothing is dropped
    shift = 0
    if len(tokens) > 1 and tokens[1] == '<s>':
        shift = 1

    tokens = tokens[shift:]
    # keep back-references aligned with the shortened token list
    backreferences = [b if b == -1 else b - shift for b in backreferences[shift:]]

    # drop a trailing end-of-sentence marker
    if tokens and tokens[-1] == '</s>':
        tokens.pop()
        backreferences.pop()

    return tokens, backreferences


def index_of(element, iterable, default=None, start=None, end=None):
    """Return the first position in ``iterable[start:end]`` that matches.

    Args:
        element: Either a value compared with ``==`` or a callable predicate.
        iterable: Indexable sequence to scan.
        default: Value returned when nothing matches.
        start: First position to inspect (``0`` when ``None``).
        end: Position at which to stop, exclusive (``len(iterable)`` when ``None``).

    Returns:
        The index of the first match, or ``default``.
    """
    matches = element if callable(element) else (lambda candidate: element == candidate)
    lower = 0 if start is None else start
    upper = len(iterable) if end is None else end
    for position in range(lower, upper):
        if matches(iterable[position]):
            return position
    return default


def separate_edges_nodes(edges_nodes_slice, *other):
    """Pair every edge label with the node that directly follows it.

    An item counts as an edge when it is a string starting with ``':'``.
    An edge followed by another edge, or sitting in the last position,
    is ignored.

    Args:
        edges_nodes_slice: Sequence in which edges and nodes alternate.
        *other: Sequences aligned with ``edges_nodes_slice`` from which the
            selected positions are extracted.

    Returns:
        One ``(edge_items, node_items)`` tuple per sequence in ``other``.
    """

    def looks_like_edge(item):
        return isinstance(item, str) and item.startswith(':')

    total = len(edges_nodes_slice)
    edge_positions = []
    cursor = 0
    while cursor < total:
        found = index_of(looks_like_edge, edges_nodes_slice, start=cursor)
        if found is None or found == total - 1:
            break
        if looks_like_edge(edges_nodes_slice[found + 1]):
            # two edges in a row: retry from the second one
            cursor = found + 1
        else:
            edge_positions.append(found)
            cursor = found + 2

    return [
        ([seq[pos] for pos in edge_positions], [seq[pos + 1] for pos in edge_positions])
        for seq in other
    ]


def _split_name_ops(graph):
    """Expand collapsed ``name`` operands back into one ``:opN`` per word.

    For every variable whose instance is ``name``, its ``:op*`` literals are
    ordered by operand number, split on ``'_'`` and re-emitted as consecutive
    quoted ``:op1``, ``:op2``, ... triples placed where the first original
    operand triple was.

    Args:
        graph: Graph exposing ``triples`` and ``metadata``.

    Returns:
        A new ``penman.Graph`` sharing the metadata of ``graph``.
    """
    # variables that are instances of the concept 'name'
    name_vars = {v1 for v1, rel, v2 in graph.triples if rel == ':instance' and v2 == 'name'}

    # collect (position, relation, unquoted literal) for each of their operands
    ops_by_var = defaultdict(list)
    for position, (v1, rel, v2) in enumerate(graph.triples):
        if v1 in name_vars and rel.startswith(':op'):
            ops_by_var[v1].append((position, rel, v2.strip('"')))

    slots = graph.triples.copy()
    for name_var, ops in ops_by_var.items():
        ops.sort(key=lambda op: int(op[1][3:]))
        positions = [op[0] for op in ops]
        for position in positions:
            slots[position] = None

        words = [word for op in ops for word in op[2].split('_')]
        slots[min(positions)] = [
            penman.Triple(name_var, ':op' + str(number), '"' + word + '"')
            for number, word in enumerate(words, start=1)
        ]

    # flatten: a slot is either a dropped entry (None), one triple, or a list of triples
    flat = []
    for slot in slots:
        if slot is None:
            continue
        if isinstance(slot, list):
            flat.extend(slot)
        else:
            flat.append(slot)

    rebuilt = penman.Graph(flat)
    rebuilt.metadata = graph.metadata
    return rebuilt


def _reconstruct_graph_from_nodes(nodes, backreferences):
    """Rebuild a ``penman.Graph`` from a flat sequence of nodes and edges.

    ``nodes`` is read as consecutive segments separated by ``'<stop>'``. The
    first item of a segment is the source node; the rest is scanned for
    ``edge, target`` pairs. A ``None`` entry is a back-reference whose target
    position is given by the parallel ``backreferences`` list.

    Args:
        nodes: Sequence of node strings, edge strings (starting with ``':'``)
            and ``None`` placeholders.
        backreferences: Sequence parallel to ``nodes``; ``-1`` or the index of
            the referenced node.

    Returns:
        A ``penman.Graph`` built from the de-duplicated triples.
    """
    triples = []
    triples_added = set()

    # variable name <-> position in `nodes` at which the variable was introduced
    variable2index = {}
    index2variable = {}
    start_index = 0

    # per source variable: how many times each edge label has been emitted
    cnt = defaultdict(Counter)

    while start_index < len(nodes):
        stop_index = index_of('<stop>', nodes, default=len(nodes) + 1, start=start_index)
        old_start_index = start_index
        start_index = stop_index + 1

        src_node, src_backr = nodes[old_start_index], backreferences[old_start_index]

        if src_node == '<stop>':
            continue

        trg_nodes_edges = nodes[old_start_index:stop_index]
        trg_nodes_edges_backr = backreferences[old_start_index:stop_index]
        trg_nodes_edges_indices = list(range(old_start_index, stop_index))

        if isinstance(src_node, str):
            if src_node in ('<s>', '</s>', '<stop>'):
                continue
            elif ('/' in src_node) or (':' in src_node) or ('(' in src_node) or (')' in src_node):
                # characters that cannot appear in a concept: use a placeholder
                src_node = 'thing'

        if src_node is not None:
            src_node = str(src_node)
            src_var = src_node[0].lower()
            # Variables are named after the concept's first letter; anything
            # that is not a lowercase ASCII letter falls back to 'x'. (The
            # previous `not ... not in` test was inverted: it renamed every
            # alphabetic variable to 'x' and let digits/quotes through.)
            if src_var not in 'abcdefghijklmnopqrstuvwxyz':
                src_var = 'x'
            # src_var = f'{src_var}_{len(variable2index)}'
            src_var = f'{src_var}{len(variable2index)}'
            src_var_i = old_start_index
            variable2index[src_var] = src_var_i
            index2variable[src_var_i] = src_var
            triple = penman.Triple(src_var, ':instance', src_node)
            if triple not in triples_added:
                triples.append(triple)
                triples_added.add(triple)
        else:
            # NOTE(review): when the back-reference cannot be resolved,
            # `src_var` keeps its value from the previous segment (or is
            # unbound on the first one) - confirm whether that is intended.
            if src_backr in index2variable:
                src_var = index2variable[src_backr]
        # more resilient logic here
        (trg_edges, trg_nodes), (_, trg_nodes_backr), (_, trg_nodes_indices) = \
            separate_edges_nodes(
                trg_nodes_edges,
                trg_nodes_edges,
                trg_nodes_edges_backr,
                trg_nodes_edges_indices)

        for n, e, nb, ni in zip(trg_nodes, trg_edges, trg_nodes_backr, trg_nodes_indices):

            # skip targets that are themselves edges or special tokens
            if isinstance(n, str) and n.startswith(':'):
                continue
            if isinstance(n, str) and n.startswith('<') and n.endswith('>'):
                continue
            # ':li' is the only accepted edge label shorter than four characters
            if e == ':li':
                pass
            elif len(e) < 4 or (not e.startswith(':')):
                continue

            # same edge more than once
            num = cnt[src_var][e]
            # num = 0
            if num:

                # operands and sentences must be unique per source
                if e.startswith(':op') or e.startswith(':snt'):
                    continue
                # elif e.startswith(':ARG'):
                #    continue
                elif num > 3:
                    continue

            if n is None:
                # back-reference: reuse the variable introduced at that position
                if nb not in index2variable:
                    continue
                trg_var = index2variable[nb]
                trg = trg_var
            elif e == ':mode':
                trg = n
            elif (not isinstance(n, str)) or re.match(r"^[+-]?\d+\.?\d*$", n) or (n == '-') or (n == '+'):
                # numbers and polarity signs are constants
                trg = str(n)
            elif (n.startswith('"') and n.endswith('"') and len(n) > 2):
                # quoted literal: strip stray inner quotes
                trg = '"' + n.replace('"', '') + '"'
            elif ('/' in n) or (':' in n) or ('(' in n) or (')' in n) or ('=' in n):
                # would break the PENMAN notation if left bare: quote it
                trg = f'"{n}"'
            elif n == '"':
                continue
            elif (n.startswith('"') and (not n.endswith('"'))) or (not n.startswith('"') and (n.endswith('"'))) or (
                    '"' in n):
                # unbalanced quotes: normalise to a quoted literal
                trg = '"' + n.replace('"', '') + '"'
            else:
                # ordinary concept: introduce a new variable for it
                trg_var = n[0].lower()
                if trg_var not in 'abcdefghijklmnopqrstuvwxyz':
                    trg_var = 'x'
                # trg_var = f'{trg_var}_{len(variable2index)}'
                trg_var = f'{trg_var}{len(variable2index)}'
                trg_var_i = ni
                variable2index[trg_var] = trg_var_i
                index2variable[trg_var_i] = trg_var
                triple = penman.Triple(trg_var, ':instance', n)
                if triple not in triples_added:
                    triples.append(triple)
                    triples_added.add(triple)
                trg = trg_var

            triple = penman.Triple(src_var, e, trg)
            if triple not in triples_added:
                triples.append(triple)
                triples_added.add(triple)

            cnt[src_var][e] += 1

    return penman.Graph(triples)


def build_graph(nodes, backreferences, restore_name_ops=False):
    """Build a graph from decoded nodes and back-references.

    Args:
        nodes: Decoded node/edge sequence.
        backreferences: Sequence parallel to ``nodes``.
        restore_name_ops: When true, the reconstructed graph is additionally
            passed through ``_split_name_ops``.

    Returns:
        The reconstructed graph.
    """
    reconstructed = _reconstruct_graph_from_nodes(nodes, backreferences)
    return _split_name_ops(reconstructed) if restore_name_ops else reconstructed


class ParsedStatus(enum.Enum):
    """Outcome of turning decoded tokens into a graph."""
    # the graph could be encoded as it is
    OK = 0
    # the graph had to be joined under a synthetic root before it could be encoded
    FIXED = 1
    # decoding or graph building failed and a fallback result is returned instead
    BACKOFF = 2


def connect_graph_if_not_connected(graph):
    """Return an encodable version of ``graph``, repairing it if needed.

    If ``pm_encode(graph)`` succeeds the graph is returned untouched with
    ``ParsedStatus.OK``. Otherwise a new ``and`` node is added and linked
    with ``:op1``, ``:op2``, ... to one variable of every connected
    component, and the result is returned with ``ParsedStatus.FIXED``.

    Args:
        graph: Graph exposing ``triples``, ``variables()`` and ``metadata``.

    Returns:
        ``(graph, status)``.

    Raises:
        Exception: Whatever ``pm_encode`` raises if the repaired graph still
            cannot be encoded.
    """
    try:
        pm_encode(graph)
        return graph, ParsedStatus.OK
    except Exception:
        # Encoding failed: fall through to the repair below. Only ordinary
        # errors are swallowed so that KeyboardInterrupt/SystemExit propagate.
        pass

    # undirected view of the variables; a self-loop registers a variable that
    # only has non-variable targets
    nxgraph = nx.MultiGraph()
    variables = graph.variables()
    for v1, _, v2 in graph.triples:
        if v1 in variables and v2 in variables:
            nxgraph.add_edge(v1, v2)
        elif v1 in variables:
            nxgraph.add_edge(v1, v1)

    triples = graph.triples.copy()
    new_triples = []
    addition = f'a{len(variables) + 1}'
    triples.append(penman.Triple(addition, ':instance', 'and'))
    for i, conn_set in enumerate(nx.connected_components(nxgraph), start=1):
        edge = f':op{i}'
        # attach the component through its lowest-numbered variable
        conn_set = sorted(conn_set, key=lambda x: int(x[1:]))
        conn_set = [c for c in conn_set if c in variables]
        node = conn_set[0]
        new_triples.append(penman.Triple(addition, edge, node))
    # the new edges go first, ahead of the existing triples
    triples = new_triples + triples
    metadata = graph.metadata
    graph = penman.Graph(triples)
    graph.metadata.update(metadata)
    # encode once more; an error here is propagated to the caller
    pm_encode(graph)

    return graph, ParsedStatus.FIXED


def restore_backreferences_from_pointers(nodes):
    """Turn ``<pointer:N>`` markers into explicit back-references.

    Pointer tokens are removed from the output. The first time a pointer is
    seen, the node following it is recorded as that pointer's position. On
    later occurrences a ``None`` placeholder referring to the recorded
    position is inserted before the following node.

    Args:
        nodes: Sequence of nodes that may contain ``<pointer:N>`` strings.

    Returns:
        ``(new_nodes, new_backreferences)``: two parallel lists; the second
        holds ``-1`` for ordinary nodes and the referenced index for
        ``None`` placeholders.
    """
    out_nodes = []
    out_backrefs = []
    first_seen = {}
    pending = None
    for node in nodes:
        if isinstance(node, str) and node.startswith('<pointer:') and node.endswith('>'):
            # remember the pointer; it is resolved by the next ordinary node
            pending = node
            continue

        if pending is not None:
            if pending in first_seen:
                out_nodes.append(None)
                out_backrefs.append(first_seen[pending])
            else:
                first_seen[pending] = len(out_nodes)
            pending = None

        out_nodes.append(node)
        out_backrefs.append(-1)
    return out_nodes, out_backrefs


================================================
FILE: hanlp/components/amr/seq2seq/dataset/tokenization_bart.py
================================================
import copy
import sys
from typing import Set, Iterable

import penman
import regex as re
import torch
from transformers import BartTokenizer

from . import postprocessing
from .linearization import AMRTokens, AMRLinearizer
from .penman import pm_encode


class AMRBartTokenizer(BartTokenizer):
    """BART tokenizer extended with the vocabulary and helpers needed for AMR graphs.

    On top of the base tokenizer it registers graph-specific tokens, encodes
    linearized graphs into ids (with back-references expressed as ids shifted
    past the vocabulary size) and decodes model output back into graphs.
    """
    # graph control tokens that are always added to the vocabulary
    ADDITIONAL = [
        AMRTokens.PNTR_N,
        AMRTokens.STOP_N,
        AMRTokens.LIT_START,
        AMRTokens.LIT_END,
        AMRTokens.BACKR_SRC_N,
        AMRTokens.BACKR_TRG_N, ]

    def __init__(self, *args, use_pointer_tokens=False, collapse_name_ops=False, INIT='Ġ', **kwargs):
        """Create the tokenizer.

        Args:
            *args: Forwarded to ``BartTokenizer``.
            use_pointer_tokens: Whether ``<pointer:N>`` tokens are used.
            collapse_name_ops: Forwarded to the linearizer.
            INIT: Prefix that marks the beginning of a word in the vocabulary.
            **kwargs: Forwarded to ``BartTokenizer``.
        """
        super().__init__(*args, **kwargs)
        self.INIT = INIT
        # pre-tokenization pattern used by `_tok_bpe`; keeps `<...>` and `:rel` tokens in one piece
        self.patterns = re.compile(
            r""" ?<[a-z]+:?\d*>| ?:[^\s]+|'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
        self.linearizer = AMRLinearizer(use_pointer_tokens=use_pointer_tokens, collapse_name_ops=collapse_name_ops)
        self.use_pointer_tokens = use_pointer_tokens
        self.collapse_name_ops = collapse_name_ops
        self.recategorizations = set()
        # number of tokens added by `init_amr_vocabulary`
        self.modified = 0

    @classmethod
    def from_pretrained(cls, pretrained_model_path, additional_tokens: Iterable[str] = None,
                        recategorization_tokens: Iterable[str] = None,
                        *args, **kwargs):
        """Load a pretrained tokenizer and extend its vocabulary for AMR.

        Args:
            pretrained_model_path: Forwarded to the parent ``from_pretrained``.
            additional_tokens: Extra tokens to add to the vocabulary.
            recategorization_tokens: Recategorization tokens to add.
            *args: Forwarded to the parent ``from_pretrained``.
            **kwargs: Forwarded to the parent ``from_pretrained``.

        Returns:
            The initialised tokenizer instance.
        """
        inst = super().from_pretrained(pretrained_model_path, *args, **kwargs)
        inst.init_amr_vocabulary(additions=additional_tokens, recategorization_tokens=recategorization_tokens)
        return inst

    def init_amr_vocabulary(self, additions: Set[str] = None, recategorization_tokens: Iterable[str] = None):
        """Extend ``encoder``/``decoder`` with the graph vocabulary.

        Args:
            additions: Extra tokens to add.
            recategorization_tokens: Recategorization tokens to add; those not
                starting with ``'_'`` are also recorded in
                ``self.recategorizations``.
        """
        # re-key the existing special tokens so that they carry the INIT prefix
        for tok in self.all_special_tokens:
            ntok = self.INIT + tok
            i = self.encoder[tok]
            self.decoder[i] = ntok
            del self.encoder[tok]
            self.encoder[ntok] = i

        tokens = []
        if additions:
            tokens.extend(additions)

        if recategorization_tokens:
            for tok in recategorization_tokens:
                if not tok.startswith('_'):
                    self.recategorizations.add(tok)
                tokens.append(tok)

        if self.use_pointer_tokens:
            for cnt in range(512):
                tokens.append(f"<pointer:{cnt}>")

        tokens += self.ADDITIONAL
        # suffix-like tokens ('_...' and '-...') are stored without the INIT prefix
        tokens = [self.INIT + t if t[0] not in ('_', '-') else t for t in tokens]
        tokens = [t for t in tokens if t not in self.encoder]
        self.old_enc_size = old_enc_size = len(self.encoder)
        for i, t in enumerate(tokens, start=old_enc_size):
            self.encoder[t] = i

        # renumber densely in id order and rebuild the reverse mapping
        self.encoder = {k: i for i, (k, v) in enumerate(sorted(self.encoder.items(), key=lambda x: x[1]))}
        self.decoder = {v: k for k, v in sorted(self.encoder.items(), key=lambda x: x[1])}
        self.modified = len(tokens)

        self.bos_token = self.INIT + self.bos_token
        self.pad_token = self.INIT + self.pad_token
        self.eos_token = self.INIT + self.eos_token
        self.unk_token = self.INIT + self.unk_token

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Wrap one or two id sequences in BOS/EOS markers.

        Returns:
            ``[bos] + token_ids_0 + [eos]``, followed by
            ``[eos] + token_ids_1 + [eos]`` when a second sequence is given.
        """
        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
        if token_ids_1 is None:
            return output
        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]

    def _tokenize(self, text):
        """Tokenize a string, keeping recategorization pointers intact.

        The text is split on single spaces. A span of the form
        ``<recategorization>_<suffix>`` whose parts are known is emitted as
        two tokens; every other span goes through the regular BPE.
        """
        bpe_tokens = []
        for tok_span in text.lstrip().split(' '):
            tok_span = tok_span.strip()
            recats = tok_span.rsplit('_', 1)
            if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder:
                bpe_tokens.extend([self.INIT + recats[0], '_' + recats[1]])
            else:
                for token in re.findall(self.pat, ' ' + tok_span):
                    token = "".join(
                        self.byte_encoder[b] for b in token.encode("utf-8")
                    )  # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
                    bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))

        return bpe_tokens

    def _tok_bpe(self, token, add_space=True):
        """Split a single graph token into BPE pieces.

        Args:
            token: Token text.
            add_space: Currently unused.

        Returns:
            List of BPE sub-tokens.
        """
        # if add_space:
        #     token = ' ' + token.lstrip()
        tokk = []
        tok = token.strip()
        recats = tok.rsplit('_', 1)
        if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder:
            tokk.extend([self.INIT + recats[0], '_' + recats[1]])
        else:
            for tok in self.patterns.findall(' ' + token):
                tok = "".join(
                    self.byte_encoder[b] for b in tok.encode("utf-8"))
                toks = self.bpe(tok).split(' ')
                tokk.extend(toks)
        return tokk

    def _get_nodes_and_backreferences(self, graph):
        """Linearize ``graph`` and return ``(nodes, backreferences)``."""
        lin = self.linearizer.linearize(graph)
        linearized_nodes, backreferences = lin.nodes, lin.backreferences
        return linearized_nodes, backreferences

    def tokenize_amr(self, graph):
        """Convert a graph into BPE tokens, ids and sub-token back-references.

        Returns:
            ``(bpe_tokens, bpe_token_ids, bpe_backreferences)``; a sub-token's
            back-reference equals its own position unless it refers back to
            an earlier node.

        Raises:
            RuntimeError: If a special ``<...>`` token is not in the vocabulary.
        """
        linearized_nodes, backreferences = self._get_nodes_and_backreferences(graph)

        bpe_tokens = []
        bpe_backreferences = []
        # running position in the flattened sub-token sequence
        counter = 0

        for i, (backr, tokk) in enumerate(zip(backreferences, linearized_nodes)):
            is_in_enc = self.INIT + tokk in self.encoder
            is_rel = tokk.startswith(':') and len(tokk) > 1
            is_spc = tokk.startswith('<') and tokk.endswith('>')
            is_of = tokk.startswith(':') and tokk.endswith('-of')
            is_frame = re.match(r'.+-\d\d', tokk) is not None

            if tokk.startswith('"') and tokk.endswith('"'):
                # quoted literal: wrap its words in literal start/end markers
                tokk = tokk[1:-1].replace('_', ' ')
                bpe_toks = [self.INIT + AMRTokens.LIT_START]
                bpe_toks += self._tok_bpe(tokk, add_space=True)
                bpe_toks.append(self.INIT + AMRTokens.LIT_END)

            elif (is_rel or is_spc or is_frame or is_of):
                if is_in_enc:
                    bpe_toks = [self.INIT + tokk]
                elif is_frame:
                    # keep the three-character sense suffix (e.g. '-01') as one token
                    bpe_toks = self._tok_bpe(tokk[:-3], add_space=True) + [tokk[-3:]]
                elif is_of:
                    rel = tokk[:-3]
                    if self.INIT + rel in self.encoder:
                        bpe_toks = [self.INIT + rel, '-of']
                    else:
                        bpe_toks = [self.INIT + ':'] + self._tok_bpe(rel[1:], add_space=True) + ['-of']
                elif is_rel:
                    bpe_toks = [self.INIT + ':'] + self._tok_bpe(tokk[1:], add_space=True)
                else:
                    # Only an unknown special token reaches this point. A bare
                    # `raise` here has no active exception and surfaces as an
                    # opaque RuntimeError, so raise the same type with context.
                    raise RuntimeError(f'Special token {tokk!r} is not in the vocabulary')

            else:
                if is_in_enc:
                    bpe_toks = [self.INIT + tokk]
                else:
                    bpe_toks = self._tok_bpe(tokk, add_space=True)

            bpe_tokens.append(bpe_toks)

            if i == backr:
                # not a back-reference: every sub-token points at itself
                bpe_backr = list(range(counter, counter + len(bpe_toks)))
                counter += len(bpe_toks)
                bpe_backreferences.append(bpe_backr)
            else:
                # back-reference: point at the first sub-token of the referenced node
                bpe_backreferences.append(bpe_backreferences[backr][0:1])
                counter += 1
        bpe_tokens = [b for bb in bpe_tokens for b in bb]
        bpe_token_ids = [self.encoder.get(b, self.unk_token_id) for b in bpe_tokens]
        bpe_backreferences = [b for bb in bpe_backreferences for b in bb]
        return bpe_tokens, bpe_token_ids, bpe_backreferences

    def batch_encode_sentences(self, sentences, device=torch.device('cpu')):
        """Encode and pad a batch of sentences.

        Returns:
            ``(batch, extra)`` where ``batch`` holds the encoded tensors moved
            to ``device`` and ``extra`` holds the input sentences.
        """
        sentences = [s for s in sentences]
        extra = {'sentences': sentences}
        batch = super().batch_encode_plus(sentences, return_tensors='pt', pad_to_max_length=True)
        batch = {k: v.to(device) for k, v in batch.items()}
        return batch, extra

    def linearize(self, graph):
        """Encode a graph as a single id sequence.

        Ordinary positions carry the token id; back-references carry the
        referenced position shifted by the vocabulary size.

        Returns:
            ``(token_uni_ids, extra)`` where ``extra`` holds the sub-tokens
            and the graph.
        """
        shift = len(self.encoder)
        tokens, token_ids, backreferences = self.tokenize_amr(graph)
        extra = {'linearized_graphs': tokens, 'graphs': graph}
        token_uni_ids = \
            [idx if i == b else b + shift for i, (idx, b) in enumerate(zip(token_ids, backreferences))]
        # NOTE(review): this compares an integer id with a string, so the test
        # is always true and EOS is always appended - confirm the intent.
        if token_uni_ids[-1] != (self.INIT + AMRTokens.EOS_N):
            tokens.append(self.INIT + AMRTokens.EOS_N)
            token_ids.append(self.eos_token_id)
            token_uni_ids.append(self.eos_token_id)
            backreferences.append(len(backreferences))
        return token_uni_ids, extra

    def batch_encode_graphs(self, graphs, device=torch.device('cpu')):
        """Linearize and batch several graphs; see ``batch_encode_graphs_from_linearized``."""
        linearized, extras = zip(*[self.linearize(g) for g in graphs])
        return self.batch_encode_graphs_from_linearized(linearized, extras, device=device)

    def batch_encode_graphs_from_linearized(self, linearized, extras=None, device=torch.device('cpu')):
        """Pad linearized graphs into decoder inputs and labels.

        Args:
            linearized: Iterable of id lists.
            extras: Optional per-graph dicts as returned by ``linearize``.
            device: Device the tensors are moved to.

        Returns:
            ``(batch, batch_extra)``; ``batch`` contains ``decoder_input_ids``
            (all but the last column) and ``lm_labels`` (all but the first).
        """
        if extras is not None:
            batch_extra = {'linearized_graphs': [], 'graphs': []}
            for extra in extras:
                batch_extra['graphs'].append(extra['graphs'])
                batch_extra['linearized_graphs'].append(extra['linearized_graphs'])
        else:
            batch_extra = {}
        maxlen = 0
        batch = []
        for token_uni_ids in linearized:
            maxlen = max(len(token_uni_ids), maxlen)
            batch.append(token_uni_ids)
        # right-pad every sequence to the longest one
        batch = [x + [self.pad_token_id] * (maxlen - len(x)) for x in batch]
        batch = torch.tensor(batch).to(device)
        batch = {'decoder_input_ids': batch[:, :-1], 'lm_labels': batch[:, 1:]}
        return batch, batch_extra

    def decode_amr(self, tokens, restore_name_ops=False):
        """Decode model output ids into a graph.

        Failures are reported on stderr and never raised.

        Args:
            tokens: Ids produced by the model.
            restore_name_ops: Forwarded to ``postprocessing.build_graph``.

        Returns:
            ``(graph, status, (nodes, backreferences))``. On failure the
            graph is ``postprocessing.BACKOFF`` and the status is
            ``ParsedStatus.BACKOFF``.
        """
        try:
            nodes, backreferences = postprocessing.decode_into_node_and_backreferences(tokens, self)
        except Exception as e:
            print('Decoding failure:', file=sys.stderr)
            print(e, file=sys.stderr)
            return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
        if self.use_pointer_tokens:
            nodes, backreferences = postprocessing.restore_backreferences_from_pointers(nodes)
        try:
            # graph_ keeps the graph as built, for diagnostics after reconnection
            graph_ = graph = postprocessing.build_graph(nodes, backreferences, restore_name_ops=restore_name_ops)
        except Exception as e:
            print('Building failure:', file=sys.stderr)
            print(nodes, file=sys.stderr)
            print(backreferences, file=sys.stderr)
            print(e, file=sys.stderr)
            return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
        try:
            graph, status = postprocessing.connect_graph_if_not_connected(graph)
            if status == postprocessing.ParsedStatus.BACKOFF:
                # diagnostics go to stderr like every other failure report here
                print('Reconnection 1 failure:', file=sys.stderr)
                print(nodes, file=sys.stderr)
                print(backreferences, file=sys.stderr)
                print(graph_, file=sys.stderr)
            return graph, status, (nodes, backreferences)
        except Exception as e:
            print('Reconnction 2 failure:', file=sys.stderr)
            print(e, file=sys.stderr)
            print(nodes, file=sys.stderr)
            print(backreferences, file=sys.stderr)
            print(graph_, file=sys.stderr)
            return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (nodes, backreferences)


class PENMANBartTokenizer(AMRBartTokenizer):
    """AMR BART tokenizer that works on the PENMAN string encoding of a graph.

    Graphs are serialised with ``penman.encode`` and split into tokens; decoded
    token sequences are repaired and parsed back with ``penman.decode``.
    """

    def __init__(self, *args, raw_graph=False, **kwargs):
        """Initialise the parent tokenizer and the PENMAN-specific switches.

        Args:
            *args: Positional arguments forwarded to the parent constructor.
            raw_graph: Stored on the instance; ``tokenize_amr`` and
                ``decode_amr`` branch on it.
            **kwargs: Keyword arguments forwarded to the parent constructor.
        """
        super().__init__(*args, **kwargs)
        self.raw_graph = raw_graph
        # Parentheses are kept in the linearization by default.
        self.remove_pars = False
        # This class does not use a linearizer object; clear the attribute.
        self.linearizer = None

    def _tokenize_encoded_graph(self, encoded):
        """Split a PENMAN-encoded graph string into a flat list of tokens.

        Args:
            encoded: The graph as a PENMAN string.

        Returns:
            ``[AMRTokens.BOS_N]`` followed by the tokens of ``encoded``.
        """
        # Surround quoted literals with spaces so each becomes its own chunk.
        spaced = re.sub(r"(\".+?\")", r' \1 ', encoded)
        # Single-pass equivalent of padding '(', ')', ':' and '/' one after another.
        padding = str.maketrans({'(': ' ( ', ')': ' ) ', ':': ' :', '/': ' / '})
        chunks = [
            chunk if chunk.startswith('"') and chunk.endswith('"') else chunk.translate(padding).strip()
            for chunk in spaced.split()
        ]
        flat = re.sub(r'\s+', ' ', ' '.join(chunks)).strip()
        return [AMRTokens.BOS_N] + flat.split(' ')

    def tokenize_amr(self, graph):
        """Tokenize a graph, optionally straight from its raw PENMAN text.

        Args:
            graph: The graph to tokenize.

        Returns:
            ``(bpe_tokens, bpe_token_ids, bpe_backreferences)``. In raw-graph
            mode every backreference points at its own position.
        """
        if not self.raw_graph:
            return super().tokenize_amr(graph)

        # Encode a metadata-free copy so the caller's graph is left untouched.
        stripped = copy.deepcopy(graph)
        stripped.metadata = {}
        text = re.sub(r"\s+", ' ', penman.encode(stripped))

        # At most 1022 sub-word tokens are kept after the BOS token.
        bpe_tokens = [self.bos_token]
        bpe_tokens += self._tokenize(text)[:1022]
        lookup = self.encoder.get
        bpe_token_ids = [lookup(tok, self.unk_token_id) for tok in bpe_tokens]
        bpe_backreferences = list(range(len(bpe_token_ids)))
        return bpe_tokens, bpe_token_ids, bpe_backreferences

    def _get_nodes_and_backreferences(self, graph):
        """Linearize ``graph`` through its PENMAN encoding.

        With pointer tokens enabled, each token that precedes a ``'/'`` is
        mapped to a ``<pointer:N>`` token; it is replaced where it opens a node
        (the following ``'/'`` is dropped) and where it follows a ``:``-token.

        Args:
            graph: The graph to linearize.

        Returns:
            ``(linearized_nodes, backreferences)`` where every backreference is
            simply the node's own index.
        """
        bare = copy.deepcopy(graph)
        bare.metadata = {}
        tokens = self._tokenize_encoded_graph(penman.encode(bare))

        if self.use_pointer_tokens:
            # Whatever precedes a '/' gets a pointer token.
            pointer_of = {}
            for before, current in zip(tokens, tokens[1:]):
                if current == '/':
                    pointer_of[before] = f'<pointer:{len(pointer_of)}>'

            rewritten = [tokens[0]]
            position = 1
            total = len(tokens)
            while position < total:
                current = tokens[position]
                step = 1
                if current in pointer_of:
                    previous = rewritten[-1]
                    if previous == '(' and tokens[position + 1] == '/':
                        # Node opening: emit the pointer and skip the '/'.
                        current = pointer_of[current]
                        step = 2
                    elif previous.startswith(':'):
                        # Token after a ':'-token: emit the pointer.
                        current = pointer_of[current]
                rewritten.append(current)
                position += step
            tokens = rewritten

            if self.remove_pars:
                tokens = [tok for tok in tokens if tok != '(']

        return tokens, list(range(len(tokens)))

    def _classify(self, node):
        if not isinstance(node, str):
            return "CONST"
        elif node == 'i':
            return "I"
        elif re.match(r'^[a-z]\d*$', node) is not None:
            return "VAR"
        elif node[0].isdigit():
            return "CONST"
        elif node.startswith('"') and node.endswith('"'):
            return "CONST"
        elif node in ('+', '-'):
            return "CONST"
        elif node == ':mode':
            return 'MODE'
        elif node.startswith(':'):
            return "EDGE"
        elif node in ['/', '(', ')']:
            return node
        elif node[0].isalpha():
            for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\'):
                if char in node:
                    return "CONST"
            return "INST"
        else:
            return 'CONST'

    def _fix_and_make_graph(self, nodes):
        """Repair a decoded node sequence and parse it into a ``penman.Graph``.

        The sequence is normalised in several passes before parsing:

        1. drop ``<...>`` special tokens other than pointers;
        2. in pointer mode, split pointers glued to trailing text and rewrite
           ``<pointer:N>`` as variable ``zN``, opening a node unless the next
           token is a ``:``-token or ``')'``;
        3. glue a lone ``':'`` to the token after it;
        4. drop a ``'/'`` and the token after it when the previous node
           already has one;
        5. rename variables that are declared more than once;
        6. balance parentheses, keep only tokens allowed after their
           predecessor, then balance again;
        7. parse, give target-less triples a ``thing`` node, re-encode with
           ``pm_encode``, apply regex fix-ups and parse once more.

        Args:
            nodes: Decoded nodes; strings, possibly mixed with non-strings.

        Returns:
            The ``penman.Graph`` parsed from the repaired linearization.

        Raises:
            ValueError: If nothing is left after removing special tokens.
            Exception: Errors from the repair passes or from ``penman`` are not
                caught here.
        """

        def balance(tokens):
            """Cut ``tokens`` at the first balanced point and close open nodes."""
            balanced = []
            opened = 0
            closed = 0
            if tokens[0] != '(':
                balanced.append('(')
                opened += 1
            for tok in tokens:
                if tok == '(':
                    opened += 1
                elif tok == ')':
                    closed += 1
                balanced.append(tok)
                if opened == closed:
                    break
            return balanced + [')'] * (opened - closed)

        # Pass 1: remove special tokens but keep pointer tokens.
        nodes = [
            n for n in nodes
            if not (isinstance(n, str) and n.startswith('<') and n.endswith('>')
                    and not n.startswith('<pointer:'))
        ]
        if not nodes:
            # Fail with a clear message instead of an incidental IndexError.
            raise ValueError('No nodes left to build a graph from.')

        if self.use_pointer_tokens:
            # Pass 2a: separate a pointer from text glued right after it.
            split_nodes = []
            for node in nodes:
                if isinstance(node, str) and node.startswith('<pointer:'):
                    end = node.find('>')
                    if end != len(node) - 1:
                        split_nodes.append(node[:end + 1])
                        split_nodes.append(node[end + 1:])
                        continue
                split_nodes.append(node)
            nodes = split_nodes

            # Pass 2b: rewrite pointers as variables.
            expanded = [nodes[0]]
            for idx in range(1, len(nodes)):
                node = nodes[idx]
                is_pointer = isinstance(node, str) and node.startswith('<pointer:')
                if not is_pointer or idx + 1 >= len(nodes):
                    # A pointer in final position has no following token to
                    # inspect; keep it untouched rather than index past the end.
                    expanded.append(node)
                    continue
                variable = 'z' + node[9:-1]
                following = nodes[idx + 1]
                if isinstance(following, str) and (following.startswith(':') or following == ')'):
                    # Not an expansion: a plain reference to the variable.
                    expanded.append(variable)
                else:
                    if self.remove_pars or expanded[-1] != '(':
                        expanded.append('(')
                    expanded.append(variable)
                    expanded.append('/')
            nodes = expanded

        # Pass 3: merge a lone ':' with the token after it.
        merged = []
        i = 0
        while i < len(nodes) - 1:
            if nodes[i] == ':':
                merged.append(nodes[i] + nodes[i + 1])
                i += 2
            else:
                merged.append(nodes[i])
                i += 1
        if i == len(nodes) - 1:
            # The final node has not been consumed by a merge; keep it.
            merged.append(nodes[-1])
        nodes = merged

        # Pass 4: skip a '/' (and the token after it) when the token two
        # positions back in the output is already a '/'.
        single_concept = []
        i = 0
        while i < len(nodes):
            if i >= 2 and single_concept[-2] == '/' and nodes[i] == '/':
                i += 2
            else:
                single_concept.append(nodes[i])
                i += 1
        nodes = single_concept

        # Pass 5: a variable declared (followed by '/') a second time gets a
        # fresh name; later references to it are renamed as well.
        declared = set()
        renamed = {}
        duplicates = 0
        deduped = []
        for idx, node in enumerate(nodes):
            if node == '/':
                variable = deduped[-1]
                if variable in declared:
                    fresh_name = f"x{duplicates + 1000}"
                    duplicates += 1
                    deduped[-1] = fresh_name
                    renamed[variable] = fresh_name
                declared.add(variable)
                deduped.append(node)
            elif (self._classify(node) == 'VAR' and node in renamed
                  and idx < len(nodes) - 1 and nodes[idx + 1] != '/'):
                deduped.append(renamed[node])
            else:
                deduped.append(node)
        nodes = balance(deduped)

        # Pass 6: keep a token only if its category may follow the previous one.
        allowed_after = {
            '(': ('VAR', 'I'),
            ')': (')', 'EDGE', 'MODE'),
            'VAR': ('/', 'EDGE', 'MODE', ')'),
            '/': ('INST', 'I'),
            'INST': (')', 'EDGE', 'MODE'),
            'I': ('/', ')', 'EDGE', 'MODE'),
            'EDGE': ('(', 'VAR', 'CONST', 'I'),
            'MODE': ('INST',),
            'CONST': (')', 'EDGE', 'MODE'),
        }
        pieces = []
        for piece in nodes:
            if not pieces:
                # The first token is always emitted as an opening parenthesis.
                pieces.append('(')
                continue
            piece = str(piece)
            if piece.startswith('"') or '"' in piece.strip('"'):
                # Normalise stray quotes into one surrounding pair.
                piece = '"' + piece.replace('"', '') + '"'

            prev_kind = self._classify(pieces[-1])
            kind = self._classify(piece)

            if kind == 'CONST' and any(
                    char in piece for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\', '_', '=')):
                piece = '"' + piece.strip('"') + '"'

            if kind in allowed_after.get(prev_kind, ()):
                pieces.append(piece)
            elif prev_kind == 'EDGE' and kind in (')', 'EDGE', 'MODE'):
                # A ':'-token with nothing usable after it is overwritten.
                pieces[-1] = piece
        pieces = balance(pieces)

        linearized = re.sub(r'\s+', ' ', ' '.join(pieces)).strip()

        # Pass 7: parse, then patch triples whose source or target is missing.
        graph = penman.decode(linearized + ' ')
        triples = []
        newvars = 2000
        for triple in graph.triples:
            x, rel, y = triple
            if x is None:
                continue
            if rel == ':instance' and y is None:
                triples.append(penman.Triple(x, rel, 'thing'))
            elif y is None:
                var = f'x{newvars}'
                newvars += 1
                triples.append(penman.Triple(x, rel, var))
                triples.append(penman.Triple(var, ':instance', 'thing'))
            else:
                triples.append(triple)
        linearized = pm_encode(penman.Graph(triples))

        inserted = 0

        def add_variable(match):
            # Rewrites '(x...' as '(x<3000+k> / x...', numbering each insertion.
            nonlocal inserted
            out = match.group(1) + match.group(2) + str(3000 + inserted) + ' / ' + match.group(2) + match.group(3)
            inserted += 1
            return out

        linearized = re.sub(r'(\(\s?)([a-z])([^\/:\)]+[:\)])', add_variable, linearized,
                            flags=re.IGNORECASE | re.MULTILINE)

        # Keep only the first '/ concept' of a node that lists several.
        linearized = re.sub(r'(\(\s*[a-z][\d+]\s*\/\s*[^\s\)\(:\/]+\s*)((?:/\s*[^\s\)\(:\/]+\s*)+)',
                            lambda match: match.group(1),
                            linearized,
                            flags=re.IGNORECASE | re.MULTILINE)

        # Adds a ':' to ARG labels that lack it.
        linearized = re.sub(r'([^:])(ARG)', r'\1 :\2', linearized)

        return penman.decode(linearized)

    def decode_amr(self, tokens, restore_name_ops=None):
        """Decode predicted tokens into an AMR graph, backing off when a stage fails.

        Decoding, graph building and graph reconnection are guarded separately.
        A failing stage writes its diagnostics to stderr and the method returns
        the BACKOFF graph instead of raising.

        Args:
            tokens: Predicted tokens. In raw-graph mode they are decoded to text
                and re-tokenized; otherwise they go through
                ``postprocessing.decode_into_node_and_backreferences``.
            restore_name_ops: Accepted for signature compatibility; not used here.

        Returns:
            A ``(graph, status, (nodes, backreferences))`` tuple. The last element
            is ``(None, None)`` when decoding or building failed.
        """

        def report(*lines):
            # All diagnostics go to stderr so they never mix with regular output.
            for line in lines:
                print(line, file=sys.stderr)

        try:
            if self.raw_graph:
                nodes = self._tokenize_encoded_graph(self.decode(tokens))
                backreferences = list(range(len(nodes)))
            else:
                nodes, backreferences = postprocessing.decode_into_node_and_backreferences(tokens, self)
            nodes_ = nodes
        except Exception as e:
            report('Decoding failure:', e)
            return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
        try:
            graph_ = graph = self._fix_and_make_graph(nodes)
            if self.collapse_name_ops:
                graph_ = graph = postprocessing._split_name_ops(graph)
        except Exception as e:
            report('Building failure:', nodes, backreferences, e)
            return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
        try:
            graph, status = postprocessing.connect_graph_if_not_connected(graph)
            if status == postprocessing.ParsedStatus.BACKOFF:
                # graph_ is the graph as built, before the reconnection attempt.
                report('Reconnection 1 failure:', nodes, backreferences, graph_)
            return graph, status, (nodes_, backreferences)
        except Exception as e:
            report('Reconnection 2 failure:', e, nodes, backreferences, graph_)
            return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (nodes_, backreferences)


================================================
FILE: hanlp/components/amr/seq2seq/dataset/tokenization_t5.py
================================================
import copy
import sys
from typing import Set, Iterable, Dict

import penman
import regex as re
import torch
import traceback
from transformers import T5Tokenizer, T5TokenizerFast

from . import postprocessing
from .linearization import AMRTokens, AMRLinearizer
from .penman import pm_encode


class AMRT5Tokenizer(T5TokenizerFast):
    """T5 fast tokenizer extended with AMR vocabulary and graph (de)linearization helpers."""

    # Extra AMR marker tokens; ``init_amr_vocabulary`` appends them to the added tokens.
    ADDITIONAL = [
        AMRTokens.PNTR_N,
        AMRTokens.STOP_N,
        AMRTokens.LIT_START,
        AMRTokens.LIT_END,
        AMRTokens.BACKR_SRC_N,
        AMRTokens.BACKR_TRG_N, ]

    def __init__(self, *args, use_pointer_tokens=False, collapse_name_ops=False, INIT='', **kwargs):
        """Initialise the parent tokenizer and the AMR-specific state.

        Args:
            *args: Positional arguments forwarded to the parent constructor.
            use_pointer_tokens: Stored and passed on to the ``AMRLinearizer``.
            collapse_name_ops: Stored and passed on to the ``AMRLinearizer``.
            INIT: Prefix string prepended to tokens when they are looked up or added.
            **kwargs: Keyword arguments forwarded to the parent constructor.
        """
        super().__init__(*args, **kwargs)
        self.INIT = INIT
        self.use_pointer_tokens = use_pointer_tokens
        self.collapse_name_ops = collapse_name_ops
        self.linearizer = AMRLinearizer(use_pointer_tokens=use_pointer_tokens, collapse_name_ops=collapse_name_ops)
        # Splitting pattern used by ``_tok_bpe`` before sub-word tokenization.
        self.patterns = re.compile(
            r""" ?<[a-z]+:?\d*>| ?:[^\s]+|'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
        # Filled by ``init_amr_vocabulary``.
        self.recategorizations = set()
        self.modified = 0

    @classmethod
    def from_pretrained(cls, pretrained_model_path, additional_tokens: Iterable[str] = None,
                        recategorization_tokens: Iterable[str] = None,
                        *args, **kwargs):
        """Load a pretrained tokenizer and register the AMR vocabulary on it.

        Args:
            pretrained_model_path: Passed to the parent ``from_pretrained``.
            additional_tokens: Extra tokens handed to ``init_amr_vocabulary``.
            recategorization_tokens: Recategorization tokens handed to
                ``init_amr_vocabulary``.
            *args: Forwarded to the parent ``from_pretrained``.
            **kwargs: Forwarded to the parent ``from_pretrained``.

        Returns:
            The loaded tokenizer after ``init_amr_vocabulary`` has run.
        """
        tokenizer = super().from_pretrained(pretrained_model_path, *args, **kwargs)
        tokenizer.init_amr_vocabulary(
            additions=additional_tokens,
            recategorization_tokens=recategorization_tokens,
        )
        return tokenizer

    def init_amr_vocabulary(self, additions: Set[str] = None, recategorization_tokens: Iterable[str] = None):
        """Add the AMR-specific tokens to the tokenizer.

        Records the size before the addition in ``old_enc_size`` and the number
        of tokens passed to ``add_tokens`` in ``modified``.

        Args:
            additions: Optional extra tokens.
            recategorization_tokens: Optional recategorization tokens; those not
                starting with ``'_'`` are also remembered in
                ``self.recategorizations``.
        """
        # Unlike a BPE-style tokenizer, no special-token remapping through an
        # encoder/decoder table is done here.
        vocabulary = [AMRTokens.BOS_N]
        vocabulary.extend(additions or ())

        for tok in recategorization_tokens or ():
            if not tok.startswith('_'):
                self.recategorizations.add(tok)
            vocabulary.append(tok)

        if self.use_pointer_tokens:
            vocabulary.extend(f"<pointer:{cnt}>" for cnt in range(512))

        vocabulary += self.ADDITIONAL

        # Tokens starting with '_' or '-' are added without the INIT prefix.
        prefixed = []
        for tok in vocabulary:
            prefixed.append(tok if tok[0] in ('_', '-') else self.INIT + tok)

        self.old_enc_size = len(self)
        self.add_tokens(prefixed)
        self.modified = len(prefixed)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Wrap one or two id sequences with BOS/EOS ids.

        Args:
            token_ids_0: Ids of the first sequence (a list).
            token_ids_1: Optional ids of the second sequence (a list).

        Returns:
            ``[bos] + ids_0 + [eos]``, followed by ``[eos] + ids_1 + [eos]``
            when a second sequence is given.
        """
        eos = [self.eos_token_id]
        first = [self.bos_token_id] + token_ids_0 + eos
        if token_ids_1 is not None:
            return first + eos + token_ids_1 + eos
        return first

    def _tokenize(self, text):
        """ Tokenize a string. Modified in order to handle sentences with recategorization pointers"""
        bpe_tokens = []
        for tok_span in text.lstrip().split(' '):
            tok_span = tok_span.strip()
            recats = tok_span.rsplit('_', 1)
            if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder:
                bpe_tokens.extend([self.INIT + recats[0], '_' + recats[1]])
            else:
                for token in re.findall(self.pat, ' ' + tok_span):
                    token = "".join(
                        self.byte_encoder[b] for b in token.encode("utf-8")
                    )  # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
                    bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))

        return bpe_tokens

    def _tok_bpe(self, token, add_space=True):
        # if add_space:
        #     token = ' ' + token.lstrip()
        tokk = []
        tok = token.strip()
        recats = tok.rsplit('_', 1)
        if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder:
            tokk.extend([self.INIT + recats[0], '_' + recats[1]])
        else:
            for tok in self.patterns.findall(token):
                tokk.extend(self.tokenize(tok))
        return tokk

    def _get_nodes_and_backreferences(self, graph):
        lin = self.linearizer.linearize(graph)
        linearized_nodes, backreferences = lin.nodes, lin.backreferences
        return linearized_nodes, backreferences

    def tokenize_amr(self, graph):
        """Tokenize a graph into sub-word tokens with aligned backreferences.

        Args:
            graph: The graph to tokenize.

        Returns:
            ``(bpe_tokens, bpe_token_ids, bpe_backreferences)``, three flat
            lists. A node that is not a backreference contributes the positions
            of its own sub-word tokens; a backreference contributes the first
            position of the node it refers to.

        Raises:
            RuntimeError: If a ``<...>`` special token is not in the vocabulary
                and matches no other token category.
        """
        linearized_nodes, backreferences = self._get_nodes_and_backreferences(graph)

        bpe_tokens = []
        bpe_backreferences = []
        counter = 0

        # NOTE(review): relies on ``self.encoder`` being provided elsewhere;
        # it is not set anywhere in the methods visible here - confirm.
        encoder = self.encoder
        for i, (backr, tokk) in enumerate(zip(backreferences, linearized_nodes)):
            is_in_enc = self.INIT + tokk in encoder
            is_rel = tokk.startswith(':') and len(tokk) > 1
            is_spc = tokk.startswith('<') and tokk.endswith('>')
            is_of = tokk.startswith(':') and tokk.endswith('-of')
            is_frame = re.match(r'.+-\d\d', tokk) is not None

            if tokk.startswith('"') and tokk.endswith('"'):
                # Quoted literal: tokenize the content between literal markers.
                tokk = tokk[1:-1].replace('_', ' ')
                bpe_toks = [self.INIT + AMRTokens.LIT_START]
                bpe_toks += self._tok_bpe(tokk, add_space=True)
                bpe_toks.append(self.INIT + AMRTokens.LIT_END)

            elif is_rel or is_spc or is_frame or is_of:
                if is_in_enc:
                    bpe_toks = [self.INIT + tokk]
                elif is_frame:
                    # Keep the '-NN' sense suffix as a token of its own.
                    bpe_toks = self._tok_bpe(tokk[:-3], add_space=True) + [tokk[-3:]]
                elif is_of:
                    rel = tokk[:-3]
                    if self.INIT + rel in encoder:
                        bpe_toks = [self.INIT + rel, '-of']
                    else:
                        bpe_toks = [self.INIT + ':'] + self._tok_bpe(rel[1:], add_space=True) + ['-of']
                elif is_rel:
                    bpe_toks = [self.INIT + ':'] + self._tok_bpe(tokk[1:], add_space=True)
                else:
                    # Only an out-of-vocabulary special token gets here. The
                    # original bare ``raise`` produced the same exception type
                    # without saying which token was at fault.
                    raise RuntimeError(f'Cannot tokenize special token {tokk!r}: not in the vocabulary')

            else:
                if is_in_enc:
                    bpe_toks = [self.INIT + tokk]
                else:
                    bpe_toks = self._tok_bpe(tokk, add_space=True)

            bpe_tokens.append(bpe_toks)

            if i == backr:
                bpe_backr = list(range(counter, counter + len(bpe_toks)))
                counter += len(bpe_toks)
                bpe_backreferences.append(bpe_backr)
            else:
                bpe_backreferences.append(bpe_backreferences[backr][0:1])
                counter += 1
        bpe_tokens = [b for bb in bpe_tokens for b in bb]
        bpe_token_ids = self.convert_tokens_to_ids(bpe_tokens)
        bpe_backreferences = [b for bb in bpe_backreferences for b in bb]
        return bpe_tokens, bpe_token_ids, bpe_backreferences

    def batch_encode_sentences(self, sentences, device=torch.device('cpu')):
        """Encode a batch of sentences into padded tensors on ``device``.

        Args:
            sentences: Iterable of sentences; copied into a list first.
            device: Device every tensor of the encoded batch is moved to.

        Returns:
            ``(batch, extra)`` where ``batch`` maps the keys returned by
            ``batch_encode_plus`` to tensors and ``extra`` is
            ``{'sentences': [...]}``.
        """
        sentence_list = list(sentences)
        encoded = super().batch_encode_plus(sentence_list, return_tensors='pt', pad_to_max_length=True)
        on_device = {}
        for key, tensor in encoded.items():
            on_device[key] = tensor.to(device)
        return on_device, {'sentences': sentence_list}

    def linearize(self, graph):
        """Turn a graph into a sequence of ids with backreferences folded in.

        A position that is a backreference is encoded as its target position
        plus ``len(self)``; every other position keeps its token id.

        Args:
            graph: The graph to linearize.

        Returns:
            ``(token_uni_ids, extra)`` where ``extra`` holds the tokens under
            ``'linearized_graphs'`` and the input under ``'graphs'``.
        """
        offset = len(self)
        tokens, token_ids, backreferences = self.tokenize_amr(graph)
        extra = {'linearized_graphs': tokens, 'graphs': graph}

        token_uni_ids = []
        for position, (token_id, backref) in enumerate(zip(token_ids, backreferences)):
            token_uni_ids.append(token_id if position == backref else backref + offset)

        # NOTE(review): this compares an id with a token string, so the EOS
        # token appears to be appended on every call - confirm this is intended.
        if token_uni_ids[-1] != (self.INIT + AMRTokens.EOS_N):
            # ``tokens`` is the list stored in ``extra``, so it gains the EOS too.
            tokens.append(self.INIT + AMRTokens.EOS_N)
            token_ids.append(self.eos_token_id)
            token_uni_ids.append(self.eos_token_id)
            backreferences.append(len(backreferences))
        return token_uni_ids, extra

    def batch_encode_graphs(self, graphs, device=torch.device('cpu')):
        """Linearize every graph and encode the results as one padded batch.

        Args:
            graphs: Iterable of graphs; must yield at least one graph.
            device: Device passed on to ``batch_encode_graphs_from_linearized``.

        Returns:
            Whatever ``batch_encode_graphs_from_linearized`` returns.
        """
        # Transpose [(ids, extra), ...] into (ids, ...) and (extra, ...).
        linearized, extras = zip(*map(self.linearize, graphs))
        return self.batch_encode_graphs_from_linearized(linearized, extras, device=device)

    def batch_encode_graphs_from_linearized(self, linearized, extras=None, device=torch.device('cpu')):
        """Pad linearized graphs into one tensor and split it for teacher forcing.

        Args:
            linearized: Iterable of id lists, one per graph.
            extras: Optional iterable of dicts with the keys ``'graphs'`` and
                ``'linearized_graphs'``.
            device: Device the padded tensor is moved to.

        Returns:
            ``(batch, batch_extra)``. ``batch['decoder_input_ids']`` is the
            padded tensor without its last column and ``batch['lm_labels']`` the
            same tensor without its first column. ``batch_extra`` collects the
            two keys of ``extras`` as lists, or is empty when ``extras`` is None.
        """
        if extras is None:
            batch_extra = {}
        else:
            items = list(extras)
            batch_extra = {
                'linearized_graphs': [item['linearized_graphs'] for item in items],
                'graphs': [item['graphs'] for item in items],
            }

        rows = list(linearized)
        width = max((len(row) for row in rows), default=0)
        padded = [row + [self.pad_token_id] * (width - len(row)) for row in rows]
        ids = torch.tensor(padded).to(device)
        return {'decoder_input_ids': ids[:, :-1], 'lm_labels': ids[:, 1:]}, batch_extra

    def decode_amr(self, tokens, restore_name_ops=False):
        """Decode predicted tokens into an AMR graph, backing off when a stage fails.

        Decoding, graph building and graph reconnection are guarded separately.
        A failing stage writes a traceback and its diagnostics to stderr and the
        method returns the BACKOFF graph instead of raising.

        Args:
            tokens: Predicted tokens, handed to
                ``postprocessing.decode_into_node_and_backreferences``.
            restore_name_ops: Forwarded to ``postprocessing.build_graph``.

        Returns:
            A ``(graph, status, (nodes, backreferences))`` tuple. The last element
            is ``(None, None)`` when decoding or building failed.
        """

        def report(*lines):
            # All diagnostics go to stderr, next to the tracebacks.
            for line in lines:
                print(line, file=sys.stderr)

        try:
            nodes, backreferences = postprocessing.decode_into_node_and_backreferences(tokens, self)
        except Exception as e:
            report('Decoding failure:')
            traceback.print_exc()
            return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
        if self.use_pointer_tokens:
            nodes, backreferences = postprocessing.restore_backreferences_from_pointers(nodes)
        try:
            graph_ = graph = postprocessing.build_graph(nodes, backreferences, restore_name_ops=restore_name_ops)
        except Exception as e:
            report('Building failure:')
            traceback.print_exc()
            report(nodes, backreferences, e)
            return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
        try:
            graph, status = postprocessing.connect_graph_if_not_connected(graph)
            if status == postprocessing.ParsedStatus.BACKOFF:
                # graph_ is the graph as built, before the reconnection attempt.
                report('Reconnection 1 failure:', nodes, backreferences, graph_)
            return graph, status, (nodes, backreferences)
        except Exception as e:
            report('Reconnection 2 failure:')
            traceback.print_exc()
            report(nodes, backreferences, graph_)
            return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (nodes, backreferences)


class PENMANT5Tokenizer(AMRT5Tokenizer):
    """AMR T5 tokenizer that works on the PENMAN string encoding of a graph.

    Graphs are serialised with ``penman.encode`` and split into tokens, with the
    pad token standing in for the begin-of-sequence marker.
    """

    def __init__(self, *args, raw_graph=False, **kwargs):
        """Initialise the parent tokenizer and the PENMAN-specific switches.

        Args:
            *args: Positional arguments forwarded to ``AMRT5Tokenizer``.
            raw_graph: Stored on the instance; ``tokenize_amr`` branches on it.
            **kwargs: Keyword arguments forwarded to ``AMRT5Tokenizer``.
        """
        super().__init__(*args, **kwargs)
        self.raw_graph = raw_graph
        # Parentheses are kept in the linearization by default.
        self.remove_pars = False
        # Replaces the AMRLinearizer created by the parent constructor.
        self.linearizer = None

    def _tokenize_encoded_graph(self, encoded):
        linearized = re.sub(r"(\".+?\")", r' \1 ', encoded)
        pieces = []
        for piece in linearized.split():
            if piece.startswith('"') and piece.endswith('"'):
                pieces.append(piece)
            else:
                piece = piece.replace('(', ' ( ')
                piece = piece.replace(')', ' ) ')
                piece = piece.replace(':', ' :')
                piece = piece.replace('/', ' / ')
                piece = piece.strip()
                pieces.append(piece)
        linearized = re.sub(r'\s+', ' ', ' '.join(pieces)).strip()
        # T5 uses pad instead of <s>
        # linearized_nodes = [AMRTokens.BOS_N] + linearized.split(' ')
        linearized_nodes = [self.pad_token] + linearized.split(' ')
        return linearized_nodes

    def tokenize_amr(self, graph):
        """Tokenize a graph, optionally straight from its raw PENMAN text.

        Args:
            graph: The graph to tokenize.

        Returns:
            ``(bpe_tokens, bpe_token_ids, bpe_backreferences)``. In raw-graph
            mode every backreference points at its own position.
        """
        if not self.raw_graph:
            return super().tokenize_amr(graph)

        # Encode a metadata-free copy so the caller's graph is left untouched.
        stripped = copy.deepcopy(graph)
        stripped.metadata = {}
        text = re.sub(r"\s+", ' ', penman.encode(stripped))

        # At most 1022 sub-word tokens are kept after the BOS token.
        bpe_tokens = [self.bos_token]
        bpe_tokens += self._tokenize(text)[:1022]
        lookup = self.encoder.get
        bpe_token_ids = [lookup(tok, self.unk_token_id) for tok in bpe_tokens]
        bpe_backreferences = list(range(len(bpe_token_ids)))
        return bpe_tokens, bpe_token_ids, bpe_backreferences

    def _get_nodes_and_backreferences(self, graph):
        """Linearize ``graph`` through its PENMAN encoding.

        With pointer tokens enabled, each token that precedes a ``'/'`` is
        mapped to a ``<pointer:N>`` token; it is replaced where it opens a node
        (the following ``'/'`` is dropped) and where it follows a ``:``-token.

        Args:
            graph: The graph to linearize.

        Returns:
            ``(linearized_nodes, backreferences)`` where every backreference is
            simply the node's own index.
        """
        bare = copy.deepcopy(graph)
        bare.metadata = {}
        tokens = self._tokenize_encoded_graph(penman.encode(bare))

        if self.use_pointer_tokens:
            # Whatever precedes a '/' gets a pointer token.
            pointer_of = {}
            for before, current in zip(tokens, tokens[1:]):
                if current == '/':
                    pointer_of[before] = f'<pointer:{len(pointer_of)}>'

            rewritten = [tokens[0]]
            position = 1
            total = len(tokens)
            while position < total:
                current = tokens[position]
                step = 1
                if current in pointer_of:
                    previous = rewritten[-1]
                    if previous == '(' and tokens[position + 1] == '/':
                        # Node opening: emit the pointer and skip the '/'.
                        current = pointer_of[current]
                        step = 2
                    elif previous.startswith(':'):
                        # Token after a ':'-token: emit the pointer.
                        current = pointer_of[current]
                rewritten.append(current)
                position += step
            tokens = rewritten

            if self.remove_pars:
                tokens = [tok for tok in tokens if tok != '(']

        return tokens, list(range(len(tokens)))

    def _classify(self, node):
        if not isinstance(node, str):
            return "CONST"
        elif node == 'i':
            return "I"
        elif re.match(r'^[a-z]\d*$', node) is not None:
            return "VAR"
        elif node[0].isdigit():
            return "CONST"
        elif node.startswith('"') and node.endswith('"'):
            return "CONST"
        elif node in ('+', '-'):
            return "CONST"
        elif node == ':mode':
            return 'MODE'
        elif node.startswith(':'):
            return "EDGE"
        elif node in ['/', '(', ')']:
            return node
        elif node[0].isalpha():
            for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\'):
                if char in node:
                    return "CONST"
            return "INST"
        else:
            return 'CONST'

    def _fix_and_make_graph(self, nodes):

        nodes_ = []
        for n in nodes:
            if isinstance(n, str):
                if n.startswith('<') and n.endswith('>') and (not n.startswith('<pointer:')):
                    pass
                else:
                    nodes_.append(n)
            else:
                nodes_.append(n)
        nodes = nodes_
        if not nodes:
            return penman.Graph()

        if self.use_pointer_tokens:

            i = 0
            nodes_ = []
            while i < len(nodes):
                nxt = nodes[i]
                pst = None
                if isinstance(nxt, str) and nxt.startswith('<pointer:'):
                    e = nxt.find('>')
                    if e != len(nxt) - 1:
                        pst = nxt[e + 1:]
                        nxt = nxt[:e + 1]
                    nodes_.append(nxt)
                    if pst is not None:
                        nodes_.append(pst)
                else:
                    nodes_.append(nxt)
                i += 1
            nodes = nodes_

            i = 1
            nodes_ = [nodes[0]]
            while i < len(nodes):
                nxt = nodes[i]
                if isinstance(nxt, str) and nxt.startswith('<pointer:') and i + 1 < len(nodes):
                    nxt = 'z' + nxt[9:-1]
                    fol = nodes[i + 1]
                    # is not expansion
                    if isinstance(fol, str) and (fol.startswith(':') or (fol == ')')):
                        nodes_.append(nxt)
                    else:
                        if self.remove_pars:
                            nodes_.append('(')
                        else:
                            if nodes_[-1] != '(':
                                nodes_.append('(')
                                # pass
                        nodes_.append(nxt)
                        nodes_.append('/')
                else:
                    nodes_.append(nxt)
                i += 1
            nodes = nodes_

        i = 0
        nodes_ = []
        while i < (len(nodes) - 1):
            if nodes[i] == ':':
                nodes_.append(nodes[i] + nodes[i + 1])
                i += 2
                last = False
            else:
                nodes_.append(nodes[i])
                i += 1
                last = True
        if last:
            nodes_.append(nodes[-1])
        nodes = nodes_

        i = 0
        nodes_ = []
        while i < (len(nodes)):
            if i < 2:
                nodes_.append(nodes[i])
                i += 1
            elif nodes_[-2] == '/' and nodes[i] == '/':
                i += 2
            else:
                nodes_.append(nodes[i])
                i += 1
        nodes = nodes_

        i = 0
        newvars = 0
        variables = set()
        remap = {}
        nodes_ = []
        while i < (len(nodes)):

            next = nodes[i]

            if next == '/':
                last = nodes_[-1]
                if last in variables:
                    last_remap = f"z{newvars + 1000}"
                    newvars += 1
                    nodes_[-1] = last_remap
                    remap[last] = last_remap
                variables.add(last)
                nodes_.append(next)

            elif self._classify(next) == 'VAR' and next in remap and (i < len(nodes) - 1) and nodes[i + 1] != '/':
                next = remap[next]
                nodes_.append(next)

            else:
                nodes_.append(next)

            i += 1

        nodes = nodes_
        pieces_ = []
        open_cnt = 0
        closed_cnt = 0
        if nodes[0] != '(':
            pieces_.append('(')
            open_cnt += 1
        for p in nodes:
            if p == '(':
                open_cnt += 1
            elif p == ')':
                closed_cnt += 1
            pieces_.append(p)
            if open_cnt == closed_cnt:
                break
        nodes = pieces_ + [')'] * (open_cnt - closed_cnt)

        pieces = []
        for piece in nodes:
            if not pieces:
                pieces.append('(')
            else:
                piece = str(piece)
                if piece.startswith('"') or piece.startswith('"') or '"' in piece.strip('"'):
                    piece = '"' + piece.replace('"', '') + '"'

                prev = self._classify(pieces[-1])
                next = self._classify(piece)

                if next == 'CONST':
                    quote = False
                    for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\', '_', '='):
                        if char in piece:
                            quote = True
                            break
                    if quote:
                        piece = '"' + piece.strip('"') + '"'

                if prev == '(':
                    if next in ('VAR', 'I'):
                        pieces.append(piece)
                elif prev == ')':
                    if next in (')', 'EDGE', 'MODE'):
                        pieces.append(piece)
                elif prev == 'VAR':
                    if next in ('/', 'EDGE', 'MODE', ')'):
                        pieces.append(piece)
                elif prev == '/':
                    if next in ('INST', 'I'):
                        pieces.append(piece)
                elif prev == 'INST':
                    if next in (')', 'EDGE', 'MODE'):
                        pieces.append(piece)
                elif prev == 'I':
                    if next in ('/', ')', 'EDGE', 'MODE'):
                        pieces.append(piece)
                elif prev == 'EDGE':
                    if next in ('(', 'VAR', 'CONST', 'I'):
                        pieces.append(piece)
                    elif next == ')':
                        pieces[-1] = piece
                    elif next in ('EDGE', 'MODE'):
                        pieces[-1] = piece
                elif prev == 'MODE':
                    if next == 'INST':
                        pieces.append(piece)
                elif prev == 'CONST':
                    if next in (')', 'EDGE', 'MODE'):
                        pieces.append(piece)

        pieces_ = []
        open_cnt = 0
        closed_cnt = 0
        if pieces[0] != '(':
            pieces_.append('(')
            open_cnt += 1
        for p in pieces:
            if p == '(':
                open_cnt += 1
            elif p == ')':
                closed_cnt += 1
            pieces_.append(p)
            if open_cnt == closed_cnt:
                break
        pieces = pieces_ + [')'] * (open_cnt - closed_cnt)

        linearized = re.sub(r'\s+', ' ', ' '.join(pieces)).strip()

        """
        line = linearized
        # make sure parentheses match
        # copied from https://github.com/RikVN/AMR/blob/master/restoreAMR/restore_amr.py
        open_count = 0
        close_count = 0
        for i, c in enumerate(line):
            if c == '(':
                open_count += 1
            elif c == ')':
                close_count += 1
            if open_count == close_count and open_count > 0:
                line = line[:i].strip()
                break
        old_line = line
        while True:
            open_count = len(re.findall(r'\(', line))
            close_count = len(re.findall(r'\)', line))
            if open_count > close_count:
                line += ')' * (open_count - close_count)
            elif close_count > open_count:
                for i in range(close_count - open_count):
                    line = line.rstrip(')')
                    line = line.rstrip(' ')
            if old_line == line:
                break
            old_line = line
        """

        graph = penman.decode(linearized + ' ')
        triples = []
        newvars = 2000
        for triple in graph.triples:
            x, rel, y = triple
            if x is None:
                pass
            elif rel == ':instance' and y is None:
                triples.append(penman.Triple(x, rel, 'thing'))
            elif y is None:
                var = f'z{newvars}'
                newvars += 1
                triples.append(penman.Triple(x, rel, var))
                triples.append(penman.Triple(var, ':instance', 'thing'))
            else:
                triples.append(triple)
        graph = penman.Graph(triples)
        linearized = pm_encode(graph)

        def fix_text(linearized=linearized):
            n = 0

            def _repl1(match):
                nonlocal n
                out = match.group(1) + match.group(2) + str(3000 + n) + ' / ' + match.group(2) + match.group(3)
                n += 1
                return out

            linearized = re.sub(r'(\(\s?)([a-z])([^\/:\)]+[:\)])', _repl1, linearized,
                                flags=re.IGNORECASE | re.MULTILINE)

            def _repl2(match):
                return match.group(1)

            linearized = re.sub(r'(\(\s*[a-z][\d+]\s*\/\s*[^\s\)\(:\/]+\s*)((?:/\s*[^\s\)\(:\/]+\s*)+)', _repl2,
                                linearized,
                                flags=re.IGNORECASE | re.MULTILINE)

            # adds a ':' to args w/o it
            linearized = re.sub(r'([^:])(ARG)', r'\1 :\2', linearized)

            # removes edges with no node
            # linearized = re.sub(r':[^\s\)\(:\/]+?\s*\)', ')', linearized, flags=re.MULTILINE)

            return linearized

        linearized = fix_text(linearized)

        g = penman.decode(linearized)
        return g

    def decode_amr(self, tokens, restore_name_ops=None):
        """Decode generated token ids into an AMR graph, backing off instead of raising on failure.

        Decoding runs in three guarded stages: (1) token ids to nodes and backreferences, (2) nodes to a
        graph via ``_fix_and_make_graph`` (plus ``_split_name_ops`` when ``collapse_name_ops`` is set),
        (3) ``postprocessing.connect_graph_if_not_connected``. A failure in any stage is reported on
        stderr and ``postprocessing.BACKOFF`` is returned.

        Args:
            tokens: Token ids of one generated sequence.
            restore_name_ops: Accepted for interface compatibility; this method does not read it.

        Returns:
            A tuple ``(graph, status, (nodes, backreferences))``. ``nodes`` and ``backreferences`` are
            ``None`` when stage 1 or stage 2 failed.
        """
        # Stage 1: token ids -> nodes + backreferences.
        try:
            if self.raw_graph:
                nodes = self._tokenize_encoded_graph(self.decode(tokens))
                backreferences = list(range(len(nodes)))
            else:
                nodes, backreferences = postprocessing.decode_into_node_and_backreferences_without_space(tokens, self) \
                    if not self.INIT else postprocessing.decode_into_node_and_backreferences(tokens, self)
            nodes_ = nodes
        except Exception:
            print('Decoding failure:', file=sys.stderr)
            traceback.print_exc()
            return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)

        # Stage 2: nodes -> graph.
        try:
            graph_ = graph = self._fix_and_make_graph(nodes)
            if self.collapse_name_ops:
                graph_ = graph = postprocessing._split_name_ops(graph)
        except Exception as e:
            print('Building failure:', file=sys.stderr)
            traceback.print_exc()
            print(nodes, file=sys.stderr)
            print(backreferences, file=sys.stderr)
            print(e, file=sys.stderr)
            return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)

        # Stage 3: reconnect the graph; ``graph_`` keeps the pre-reconnection graph for diagnostics.
        try:
            graph, status = postprocessing.connect_graph_if_not_connected(graph)
            if status == postprocessing.ParsedStatus.BACKOFF:
                # Diagnostics go to stderr like every other failure report of this method.
                print('Reconnection 1 failure:', file=sys.stderr)
                print(nodes, file=sys.stderr)
                print(backreferences, file=sys.stderr)
                print(graph_, file=sys.stderr)
            return graph, status, (nodes_, backreferences)
        except Exception as e:
            print('Reconnection 2 failure:', file=sys.stderr)
            print(e, file=sys.stderr)
            traceback.print_exc()
            print(nodes, file=sys.stderr)
            print(backreferences, file=sys.stderr)
            print(graph_, file=sys.stderr)
            return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (nodes_, backreferences)

    @property
    def encoder(self) -> Dict[str, int]:
        """The mapping returned by ``self.get_vocab()``, annotated as token string to integer id."""
        return self.get_vocab()


================================================
FILE: hanlp/components/amr/seq2seq/evaluation.py
================================================
from pathlib import Path

import penman


def write_predictions(predictions_path, tokenizer, graphs):
    """Serialize graphs in PENMAN notation to a file, one graph per blank-line-separated block.

    Args:
        predictions_path: Destination file path.
        tokenizer: Optional tokenizer; when truthy, every occurrence of ``tokenizer.INIT`` is removed
            from the serialized text before writing.
        graphs: Iterable of graphs accepted by ``penman.encode``.

    Returns:
        ``predictions_path``, unchanged.
    """
    serialized = '\n\n'.join(penman.encode(graph) for graph in graphs)
    if tokenizer:
        serialized = serialized.replace(tokenizer.INIT, '')
    destination = Path(predictions_path)
    destination.write_text(serialized)
    return predictions_path


def compute_smatch(pred, gold):
    """Score a file of predicted AMRs against a file of gold AMRs with mtool's Smatch scorer.

    Args:
        pred: Path of the prediction file.
        gold: Path of the gold file.

    Returns:
        Element ``2`` of the first item yielded by ``smatch.score_amr_pairs``
        (presumably the F-score; TODO confirm against the mtool implementation).
    """
    # Imported inside the function, so ``perin_parser`` is only needed when this function is called.
    from perin_parser.thirdparty.mtool import smatch
    with Path(pred).open() as p, Path(gold).open() as g:
        # Only the first yielded result is consumed.
        score = next(smatch.score_amr_pairs(p, g))
    return score[2]


def compute_bleu(gold_sentences, pred_sentences):
    """Compute corpus-level BLEU with ``sacrebleu.corpus_bleu``.

    Args:
        gold_sentences: Reference sentences; passed as the single reference stream.
        pred_sentences: Hypothesis sentences.

    Returns:
        Whatever ``sacrebleu.corpus_bleu`` returns for these inputs.
    """
    # Imported inside the function, so ``sacrebleu`` is only needed when this function is called.
    from sacrebleu import corpus_bleu
    return corpus_bleu(pred_sentences, [gold_sentences])


================================================
FILE: hanlp/components/amr/seq2seq/optim.py
================================================
# taken from

import math
import torch
from torch.optim.optimizer import Optimizer


class RAdam(Optimizer):
    """Rectified Adam optimizer.

    Keeps exponential moving averages of the gradient and of its square. On every step the length of
    the approximated simple moving average (``N_sma``) is computed: when it is at least 5 the adaptive
    update with a rectification term is applied; otherwise the update falls back to a momentum-only
    step (``degenerated_to_sgd=True``) or is skipped entirely.

    Args:
        params: Iterable of parameters or list of parameter-group dicts.
        lr: Learning rate, must be non-negative.
        betas: Coefficients of the two running averages, each in ``[0, 1)``.
        eps: Term added to the denominator, must be non-negative.
        weight_decay: Weight decay factor; ``0`` disables it.
        degenerated_to_sgd: Whether to take a momentum-only step while ``N_sma < 5``.

    Raises:
        ValueError: If ``lr``, ``eps`` or either beta is outside its valid range.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True):
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")
        for position in (0, 1):
            if not 0.0 <= betas[position] < 1.0:
                raise ValueError(f"Invalid beta parameter at index {position}: {betas[position]}")

        self.degenerated_to_sgd = degenerated_to_sgd
        given_as_groups = isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict)
        if given_as_groups:
            for group in params:
                # A group with its own betas cannot share the default cache of step sizes.
                if 'betas' in group and (group['betas'][0] != betas[0] or group['betas'][1] != betas[1]):
                    group['buffer'] = [[None, None, None] for _ in range(10)]
        # ``buffer`` holds 10 slots of ``[step, N_sma, step_size]`` indexed by ``step % 10``.
        defaults = {
            'lr': lr,
            'betas': betas,
            'eps': eps,
            'weight_decay': weight_decay,
            'buffer': [[None, None, None] for _ in range(10)],
        }
        super().__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)

    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure: Optional callable that re-evaluates the model and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is given.

        Raises:
            RuntimeError: If a parameter has a sparse gradient.
        """
        loss = closure() if closure is not None else None

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                # All arithmetic is done on a float32 copy, which is written back at the end.
                p_data_fp32 = p.data.float()

                state = self.state[p]
                if state:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
                else:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)

                first_moment, second_moment = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                second_moment.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                first_moment.mul_(beta1).add_(grad, alpha=1 - beta1)

                state['step'] += 1
                slot = group['buffer'][int(state['step'] % 10)]
                if state['step'] == slot[0]:
                    # Another parameter of this group already computed the values for this step.
                    N_sma, step_size = slot[1], slot[2]
                else:
                    slot[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    slot[1] = N_sma

                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        step_size = math.sqrt(
                            (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (
                                    N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    elif self.degenerated_to_sgd:
                        step_size = 1.0 / (1 - beta1 ** state['step'])
                    else:
                        step_size = -1
                    slot[2] = step_size

                # more conservative since it's an approximated value
                adaptive = N_sma >= 5
                if adaptive or step_size > 0:
                    if group['weight_decay'] != 0:
                        p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
                    if adaptive:
                        denom = second_moment.sqrt().add_(group['eps'])
                        p_data_fp32.addcdiv_(first_moment, denom, value=-step_size * group['lr'])
                    else:
                        p_data_fp32.add_(first_moment, alpha=-step_size * group['lr'])
                    p.data.copy_(p_data_fp32)

        return loss


================================================
FILE: hanlp/components/amr/seq2seq/seq2seq_amr_parser.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-04-28 17:33
import datetime
import functools
import logging
import os
from typing import Union, List, Callable

import torch
from torch.utils.data import DataLoader
from transformers import get_constant_schedule_with_warmup, T5ForConditionalGeneration
from transformers.models.bart.modeling_bart import BartForConditionalGeneration

from hanlp.common.dataset import SamplerBuilder, SortingSamplerBuilder, PadSequenceDataLoader
from hanlp.common.structure import History
from hanlp.common.torch_component import TorchComponent
from hanlp.common.vocab import Vocab
from hanlp.components.amr.seq2seq.dataset.dataset import AMRDataset, dfs_linearize_tokenize
from hanlp.components.amr.seq2seq.dataset.penman import AMRGraph
from hanlp.components.amr.seq2seq.dataset.tokenization_bart import PENMANBartTokenizer
from hanlp.components.amr.seq2seq.dataset.tokenization_t5 import PENMANT5Tokenizer
from hanlp.components.amr.seq2seq.evaluation import write_predictions, compute_smatch
from hanlp.components.amr.seq2seq.optim import RAdam
from hanlp.layers.transformers.pt_imports import PretrainedConfig, AutoConfig_
from hanlp.layers.transformers.resource import get_model_mirror, get_tokenizer_mirror
from hanlp.metrics.amr.smatch_eval import smatch_eval
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.constant import IDX
from hanlp_common.util import merge_locals_kwargs, reorder


class Seq2seq_AMR_Parser(TorchComponent):
    def __init__(self, **kwargs):
        """Create the parser with its transformer config, tokenizer and model unset.

        Args:
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(**kwargs)
        # Placeholders only: nothing is loaded here.
        self._transformer_config: PretrainedConfig = None
        self._tokenizer: PENMANBartTokenizer = None
        self.model: BartForConditionalGeneration = None

    def build_dataloader(self, data, batch_size,
                         gradient_accumulation=1,
                         shuffle=False,
                         sampler_builder: SamplerBuilder = None,
                         device=None,
                         logger: logging.Logger = None,
                         **kwargs) -> DataLoader:
        """Build a padded, length-sorted dataloader over ``data``.

        Args:
            data: Input handed to ``build_dataset``. When it is a ``str`` the dataset is additionally
                iterated once up front (presumably a file path; confirm against ``AMRDataset``).
            batch_size: Batch size forwarded to the dataloader.
            gradient_accumulation: Forwarded to the sampler builder when the dataset is cached.
            shuffle: Whether to shuffle; sample indices are generated only when this is ``False``.
            sampler_builder: Sampler factory; defaults to ``SortingSamplerBuilder(batch_max_tokens=500)``.
            device: Device forwarded to the dataloader.
            logger: Logger used for the vocabulary summary.
            **kwargs: Ignored.

        Returns:
            The dataloader created by ``_create_dataloader``.
        """
        # ``generate_idx`` is the negation of ``shuffle``.
        dataset = self.build_dataset(data, not shuffle)
        if self.vocabs.mutable:
            self.build_vocabs(dataset, logger)
        self.finalize_dataset(dataset, logger)
        if isinstance(data, str):
            dataset.purge_cache()
            timer = CountdownTimer(len(dataset))
            max_num_tokens = 0
            # lc = Counter()
            # One full pass over the dataset, tracking the longest input sequence for the progress message.
            for each in dataset:
                max_num_tokens = max(max_num_tokens, len(each['text_token_ids']))
                # lc[len(each['text_token_ids'])] += 1
                timer.log(f'Preprocessing and caching samples (longest sequence {max_num_tokens})'
                          f'[blink][yellow]...[/yellow][/blink]')
            # print(lc.most_common())
            if self.vocabs.mutable:
                self.vocabs.lock()
                self.vocabs.summary(logger)

        if not sampler_builder:
            sampler_builder = SortingSamplerBuilder(batch_max_tokens=500)
        # Samples are bucketed by input length; accumulation is only passed on for cached datasets.
        sampler = sampler_builder.build([len(x['text_token_ids']) for x in dataset], shuffle,
                                        gradient_accumulation if dataset.cache else 1)
        return self._create_dataloader(dataset, batch_size, device, sampler, shuffle)

    def _create_dataloader(self, dataset, batch_size, device, sampler, shuffle):
        """Wrap ``dataset`` in a ``PadSequenceDataLoader`` using ``sampler`` as the batch sampler.

        Padding values per field come from ``_get_pad_dict``.
        """
        return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler,
                                     pad=self._get_pad_dict())

    def _get_pad_dict(self):
        return {'text_token_ids': self._transformer_config.pad_token_id,
                'graph_token_ids': self._transformer_config.pad_token_id}

    def finalize_dataset(self, dataset, logger: logging.Logger = None):
        """Append the linearize-and-tokenize transform to ``dataset``.

        ``remove_space`` is enabled when the configured transformer name contains ``'chinese'``.
        ``logger`` is not used here.
        """
        dataset.append_transform(functools.partial(dfs_linearize_tokenize, tokenizer=self._tokenizer,
                                                   remove_space='chinese' in self.config.transformer))

    def build_dataset(self, data, generate_idx):
        """Create an ``AMRDataset`` over ``data``, forwarding ``generate_idx`` unchanged."""
        dataset = AMRDataset(data, generate_idx=generate_idx)
        return dataset

    def collect_additional_tokens(self, additional_tokens, dataset):
        """Add the extra vocabulary entries derived from ``dataset`` and the config to ``additional_tokens``.

        Three sources are merged in place: frames occurring at least ``config.pred_min`` times, every
        role of the dataset regardless of frequency, and the configured ``additional_tokens``.

        Args:
            additional_tokens: Set that is updated in place.
            dataset: Object providing ``get_frames()`` and ``get_roles()`` frequency mappings.
        """
        min_frequency = self.config.pred_min
        frequent_frames = [frame for frame, count in dataset.get_frames().items() if count >= min_frequency]
        additional_tokens.update(frequent_frames)
        additional_tokens.update(role for role, _ in dataset.get_roles().items())
        additional_tokens.update(self.config.additional_tokens)

    def build_tokenizer(self, additional_tokens) -> PENMANBartTokenizer:
        transformer = self.config.transformer
        if 't5-' in transformer:
            cls = PENMANT5Tokenizer
        elif 'bart-' in transformer:
            cls = PENMANBartTokenizer
        else:
            raise NotImplemented(f'Unsupported transformer {transformer}')
        transformer = get_tokenizer_mirror(transformer)
        self._tokenizer = cls.from_pretrained(
            transformer,
            collapse_name_ops=self.config.collapse_name_ops,
            use_pointer_tokens=self.config.use_pointer_tokens,
            raw_graph=self.config.raw_graph,
            additional_tokens=additional_tokens,
            recategorization_tokens=self.config.recategorization_tokens,
            config=self._transformer_config,
        )
        return self._tokenizer

    def build_optimizer(self, trn, lr, epochs, gradient_accumulation, warmup_steps, weight_decay, **kwargs):
        """Create the RAdam optimizer and a constant-with-warmup scheduler.

        Args:
            trn: Training dataloader; only its length is used.
            lr: Learning rate.
            epochs: Number of training epochs.
            gradient_accumulation: Number of batches per optimizer update.
            warmup_steps: Number of warmup steps, or a ``float`` fraction of the total number of
                training steps.
            weight_decay: Weight decay passed to the optimizer.
            **kwargs: Ignored.

        Returns:
            A tuple ``(optimizer, scheduler)``.
        """
        num_training_steps = len(trn) * epochs // gradient_accumulation
        if isinstance(warmup_steps, float):
            # A float is interpreted as a ratio of the total number of steps.
            warmup_steps = int(num_training_steps * warmup_steps)
        optimizer = RAdam(
            self.model.parameters(),
            lr=lr,
            weight_decay=weight_decay)
        scheduler = get_constant_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps)
        return optimizer, scheduler

    def build_criterion(self, **kwargs):
        """Return ``None``; ``feed_batch`` obtains the loss from the model's own output."""
        pass

    def build_metric(self, **kwargs):
        """Return ``None``; ``evaluate_dataloader`` computes its score without a metric object."""
        pass

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, dev_data=None, eval_after=None,
                              **kwargs):
        """Train for ``epochs`` epochs, evaluating on ``dev`` and keeping the best weights.

        Args:
            trn: Training dataloader.
            dev: Development dataloader.
            epochs: Number of epochs to train.
            criterion: Forwarded to ``fit_dataloader`` and ``evaluate_dataloader``.
            optimizer: The ``(optimizer, scheduler)`` pair forwarded to ``fit_dataloader``.
            metric: Forwarded to ``fit_dataloader``.
            save_dir: Directory for weights and for the ``dev.pred.txt`` predictions.
            logger: Logger for progress reports.
            devices: Not used by this method.
            ratio_width: Forwarded to ``fit_dataloader`` and ``evaluate_dataloader``.
            dev_data: Gold development data passed to the evaluation as ``input``.
            eval_after: Evaluation only happens on epochs strictly greater than this value. A ``float``
                is interpreted as a fraction of ``epochs``; ``None`` evaluates after every epoch.
            **kwargs: Ignored.

        Returns:
            The best development score, or ``-1`` if no evaluation improved on it.
        """
        best_epoch, best_metric = 0, -1
        if eval_after is None:
            # The signature default used to crash on ``epoch > None``; treat it as "evaluate every epoch".
            eval_after = 0
        elif isinstance(eval_after, float):
            eval_after = int(epochs * eval_after)
        timer = CountdownTimer(epochs)
        history = History()
        for epoch in range(1, epochs + 1):
            logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
            self.fit_dataloader(trn, criterion, optimizer, metric, logger, history=history, ratio_width=ratio_width,
                                **self.config)
            if epoch > eval_after:
                dev_metric = self.evaluate_dataloader(dev, criterion, logger=logger, ratio_width=ratio_width,
                                                      output=os.path.join(save_dir, 'dev.pred.txt'),
                                                      input=dev_data, use_fast=True)
            # The timer is advanced after the evaluation and before the report is formatted.
            timer.update()
            report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
            if epoch > eval_after:
                if dev_metric > best_metric:
                    best_epoch, best_metric = epoch, dev_metric
                    self.save_weights(save_dir)
                    report += ' [red](saved)[/red]'
                else:
                    # Number of epochs since the best score.
                    report += f' ({epoch - best_epoch})'
                # if epoch - best_epoch >= patience:
                #     report += ' early stop'
            logger.info(report)
            # if epoch - best_epoch >= patience:
            #     break
        if not best_epoch:
            # Nothing was ever saved (no evaluation ran or none improved): keep the final weights.
            self.save_weights(save_dir)
        elif best_epoch != epoch:
            # Restore the best checkpoint when the last epoch was not the best one.
            self.load_weights(save_dir)
        logger.info(f"Max score of dev is {best_metric} at epoch {best_epoch}")
        logger.info(f"Average time of each epoch is {timer.elapsed_average_human}")
        logger.info(f"{timer.elapsed_human} elapsed")
        return best_metric

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger,
                       history: History = None, gradient_accumulation=1, ratio_percentage=None, **kwargs):
        """Train the model for one pass over ``trn``.

        Args:
            trn: Training dataloader.
            criterion: Not used; the loss is read from the model output.
            optimizer: A pair ``(optimizer, scheduler)``.
            metric: Not used.
            logger: Logger for progress reports.
            history: Step counter that decides when an optimizer update is due; it is dereferenced
                unconditionally, so it must not be ``None``.
            gradient_accumulation: Number of batches whose gradients are accumulated per update.
            ratio_percentage: Forwarded to the progress timer.
            **kwargs: Ignored.

        Returns:
            The accumulated loss divided by ``max(timer.total, 1)``.
        """
        optimizer, scheduler = optimizer
        self.model.train()
        timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
        total_loss = 0
        for batch in trn:
            output_dict = self.feed_batch(batch)
            loss = output_dict['loss']
            if gradient_accumulation and gradient_accumulation > 1:
                # Scale each batch loss before backpropagating it into the accumulated gradients.
                loss /= gradient_accumulation
            loss.backward()
            total_loss += loss.item()
            if history.step(gradient_accumulation):
                self._step(optimizer, scheduler)
                timer.log(self.report_metrics(total_loss / (timer.current + 1)),
                          ratio_percentage=ratio_percentage, logger=logger)
            # Drop the references to the loss and model output before the next batch.
            del loss
            del output_dict
        return total_loss / max(timer.total, 1)

    def _step(self, optimizer, scheduler):
        if self.config.grad_norm:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm)
        optimizer.step()
        if scheduler:
            scheduler.step()
        optimizer.zero_grad()

    def report_metrics(self, loss):
        """Format ``loss`` with four decimals as ``'loss: x.xxxx'`` for progress reports."""
        return f'loss: {loss:.4f}'

    def feed_batch(self, batch):
        """Run the model on a batch, with teacher forcing when gold graph tokens are present.

        The attention mask is ``1`` wherever the input differs from the model's pad token id. When
        ``batch`` has ``graph_token_ids``, the decoder input is that sequence without its last position
        and the labels are the same sequence without its first position; otherwise both are ``None``.

        Args:
            batch: Mapping with ``text_token_ids`` and optionally ``graph_token_ids``.

        Returns:
            The output of ``self.model``.
        """
        input_ids = batch['text_token_ids']
        graph_ids = batch.get('graph_token_ids')
        pad_id = self.model.config.pad_token_id
        attention_mask = input_ids.ne(pad_id).to(torch.long)

        decoder_input_ids = None
        labels = None
        if graph_ids is not None:
            # Shift by one position: the decoder sees tokens [0, n-1) and predicts tokens [1, n).
            decoder_input_ids = graph_ids[:, :-1]
            labels = graph_ids[:, 1:].contiguous()

        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            labels=labels,
        )

    @torch.no_grad()
    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, ratio_width=None,
                            logger=None, input=None, use_fast=False,
                            **kwargs):
        """Predict AMR graphs for ``data``, write them to ``output`` and score them against ``input``.

        Args:
            data: Dataloader whose batches carry the gold graphs under ``'amr'``.
            criterion: Not used.
            metric: Not used.
            output: Path the predictions are written to after the last batch.
            ratio_width: Not used.
            logger: Logger for progress reports and scoring failures.
            input: Path of the gold file the predictions are scored against.
            use_fast: Use ``compute_smatch`` instead of ``smatch_eval``.
            **kwargs: Ignored.

        Returns:
            The Smatch result, or ``0`` when scoring failed or ``data`` is empty.
        """
        self.model.eval()
        timer = CountdownTimer(len(data))
        graphs = []
        orders = []
        smatch = 0
        for idx, batch in enumerate(data):
            graphs_per_batch = self.predict_amrs(batch)
            # Keep only the first candidate of every source sentence.
            graphs_per_batch = [x[0] for x in graphs_per_batch]
            # Copy meta data from gold graph
            for gp, gg in zip(graphs_per_batch, batch['amr']):
                metadata = gg.metadata.copy()
                metadata['annotator'] = f'{self.config.transformer}-amr'
                metadata['date'] = str(datetime.datetime.now())
                if 'save-date' in metadata:
                    del metadata['save-date']
                gp.metadata = metadata
            graphs.extend(graphs_per_batch)
            orders.extend(batch[IDX])
            if idx == timer.total - 1:
                # Last batch: restore the original sample order, dump the predictions and score them.
                graphs = reorder(graphs, orders)
                write_predictions(output, self._tokenizer, graphs)
                try:
                    if use_fast:
                        smatch = compute_smatch(output, input)
                    else:
                        smatch = smatch_eval(output, input, use_fast=False)
                except Exception as e:
                    # Scoring stays best-effort (the score remains 0), but the failure is reported and
                    # KeyboardInterrupt/SystemExit are no longer swallowed.
                    if logger:
                        logger.warning(f'Failed to compute Smatch for {output}: {e!r}')
                timer.log(smatch.cstr() if isinstance(smatch, MetricDict) else f'{smatch:.2%}', ratio_percentage=False,
                          logger=logger)
            else:
                timer.log(ratio_percentage=False, logger=logger)

        return smatch

    def predict_amrs(self, batch, beam_size=1):
        """Generate and decode AMR graph candidates for every source sentence in ``batch``.

        Args:
            batch: Batch forwarded to ``_model_generate``.
            beam_size: Number of generated sequences per source sentence.

        Returns:
            One list per source sentence holding its ``beam_size`` decoded graphs, ordered by ascending
            ``status.value`` with ties kept in generation order. Every graph is annotated with
            ``status``, ``nodes``, ``backreferences`` and ``tokens``.
        """
        out = self._model_generate(batch, beam_size)
        # Rows of ``out`` are consumed in consecutive groups of ``beam_size``, one group per source.
        token_rows = [
            out[row].tolist()
            for first in range(0, out.size(0), beam_size)
            for row in range(first, first + beam_size)
        ]

        tokenizer = self._tokenizer
        graphs = []
        for first in range(0, len(token_rows), beam_size):
            candidates = []
            for row in range(first, first + beam_size):
                token_ids = token_rows[row]
                graph, status, (lin, backr) = tokenizer.decode_amr(token_ids, restore_name_ops=False)
                graph.status = status
                graph.nodes = lin
                graph.backreferences = backr
                graph.tokens = token_ids
                candidates.append(graph)
            # ``sorted`` is stable, so candidates with equal status keep their generation order.
            candidates[:] = sorted(candidates, key=lambda candidate: candidate.status.value)
            graphs.append(candidates)

        return graphs

    def _model_generate(self, batch, beam_size):
        """Generate output token ids for ``batch`` with ``self.model.generate``.

        Args:
            batch: Mapping providing the input ids under ``'text_token_ids'``.
            beam_size: Used both as the number of beams and as the number of returned sequences.

        Returns:
            The value returned by ``self.model.generate``.
        """
        input_ids = batch['text_token_ids']
        # 1 for every position that is not the pad token, 0 for padding.
        attention_mask = input_ids.ne(self.model.config.pad_token_id).to(torch.long)
        # ``max_length`` and ``decoder_start_token_id`` are hard-coded here.
        out = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=1024,
            decoder_start_token_id=0,
            num_beams=beam_size,
            num_return_sequences=beam_size)
        return out

    def build_model(self, training=True, **kwargs) -> torch.nn.Module:
        """Build the seq2seq model and resize its embeddings to the tokenizer's vocabulary.

        Args:
            training: When ``True`` pretrained weights are loaded and the embeddings of newly added
                tokens are initialized. When ``False`` the model is constructed from the config alone
                and the tokenizer is rebuilt from ``self.vocabs['additional_tokens']``.
            **kwargs: Ignored.

        Returns:
            The constructed model.
        """
        # noinspection PyTypeChecker
        transformer = self.config.transformer
        cls = self._get_model_cls(transformer)
        # The model class is chosen from the configured name; loading uses the mirrored identifier.
        transformer = get_model_mirror(self.config.transformer)
        model: cls = cls.from_pretrained(
            transformer,
            config=self._transformer_config) if training else cls(self._transformer_config)
        if not training:
            self.build_tokenizer(self.vocabs['additional_tokens'])
        tokenizer = self._tokenizer
        model.resize_token_embeddings(len(tokenizer.encoder))
        if training:
            # For T5 the model itself is passed, otherwise its ``.model`` attribute is.
            self._init_new_embeddings(model if cls == T5ForConditionalGeneration else model.model, tokenizer)
        return model

    def _get_model_cls(self, transformer: str):
        if 't5-' in transformer:
            cls = T5ForConditionalGeneration
        elif 'bart-' in transformer:
            cls = BartForConditionalGeneration
        else:
            raise NotImplemented(f'Unsupported transformer {transformer}')
        return cls

    @staticmethod
    def _init_new_embeddings(model, tokenizer):
        """Initialize the embeddings of newly added tokens from existing embeddings.

        Each token whose id is at least ``tokenizer.old_enc_size`` is decomposed into pieces, the
        embeddings of the pieces found in the vocabulary are averaged, uniform noise in
        ``[-0.1, 0.1]`` is added and the result is written into ``model.encoder.embed_tokens``.
        Tokens starting with ``'<'`` other than pointer tokens are left untouched.

        Args:
            model: Module exposing ``encoder.embed_tokens.weight``.
            tokenizer: Tokenizer providing ``encoder``, ``INIT``, ``old_enc_size`` and ``_tok_bpe``.
        """
        # Counts the rows that were overwritten; the value is not returned or logged.
        modified = 0
        encoder = tokenizer.encoder
        for tok, idx in encoder.items():
            # NOTE(review): ``lstrip`` removes any leading characters contained in ``INIT``, not a prefix.
            tok = tok.lstrip(tokenizer.INIT)

            # Ids below ``old_enc_size`` are skipped and keep their existing embeddings.
            if idx < tokenizer.old_enc_size:
                continue

            elif tok.startswith('<pointer:') and tok.endswith('>'):
                # '<pointer:N>' -> ['pointer', 'N']
                tok_split = ['pointer', str(tok.split(':')[1].strip('>'))]

            elif tok.startswith('<'):
                continue

            elif tok.startswith(':'):
                # Relations are described by plain words plus their numeric suffix.
                if tok.startswith(':op'):
                    tok_split = ['relation', 'operator', str(int(tok[3:]))]

                elif tok.startswith(':snt'):
                    tok_split = ['relation', 'sentence', str(int(tok[4:]))]

                elif tok.startswith(':ARG'):
                    tok_split = ['relation', 'argument', str(int(tok[4:]))]
                else:
                    tok_split = ['relation'] + tok.lstrip(':').split('-')
            else:
                tok_split = tok.split('-')

            # Map every piece to a vocabulary entry, falling back to its BPE sub-tokens.
            tok_split_ = tok_split
            tok_split = []
            for s in tok_split_:
                s_ = s + tokenizer.INIT
                if s_ in encoder:
                    tok_split.append(s_)
                else:
                    tok_split.extend(tokenizer._tok_bpe(s))

            # Collect the embeddings of the pieces that exist in the vocabulary.
            vecs = []
            for s in tok_split:
                idx_split = encoder.get(s, -1)
                if idx_split > -1:
                    vec_split = model.encoder.embed_tokens.weight.data[idx_split].clone()
                    vecs.append(vec_split)

            if vecs:
                # Mean of the piece embeddings plus uniform noise.
                vec = torch.stack(vecs, 0).mean(0)
                noise = torch.empty_like(vec)
                noise.uniform_(-0.1, +0.1)
                model.encoder.embed_tokens.weight.data[idx] = vec + noise
                modified += 1

    def input_is_flat(self, data):
        """Return ``True`` when ``data`` is a single ``str`` rather than a collection of them."""
        return isinstance(data, str)

    def predict(self, data: Union[str, List[str]], beautiful_amr_graph=True, **kwargs):
        """Parse one sentence or a list of sentences into AMR graphs.

        Args:
            data: A single sentence or a list of sentences.
            beautiful_amr_graph: When truthy, every predicted graph is re-wrapped as an ``AMRGraph``
                built from its triples, top, epidata and metadata.
            **kwargs: Ignored.

        Returns:
            A single graph for ``str`` input, otherwise a list of graphs in input order.
        """
        flat = self.input_is_flat(data)
        sentences = [data] if flat else data
        samples = [{'text': sentence} for sentence in sentences]
        dataloader = self.build_dataloader(samples, **self.config, device=self.device)

        results, orders = [], []
        for batch in dataloader:
            # Keep only the first candidate of every source sentence.
            best = [candidates[0] for candidates in self.predict_amrs(batch)]
            if beautiful_amr_graph:
                best = [AMRGraph(g.triples, g.top, g.epidata, g.metadata) for g in best]
            results.extend(best)
            orders.extend(batch[IDX])

        # Batches may come out of input order; restore it with the collected indices.
        results = reorder(results, orders)
        return results[0] if flat else results

    def fit(self, trn_data, dev_data, save_dir, batch_size=32, epochs=30,
            transformer='facebook/bart-base',
            lr=5e-05,
            grad_norm=2.5,
            weight_decay=0.004,
            warmup_steps=1,
            dropout=0.25,
            attention_dropout=0.0,
            pred_min=5,
            eval_after=0.5,
            collapse_name_ops=False,
            use_pointer_tokens=True,
            raw_graph=False,
            gradient_accumulation=1,
            recategorization_tokens=(
                    'PERSON', 'COUNTRY', 'QUANTITY', 'ORGANIZATION', 'DATE_ATTRS', 'NATIONALITY', 'LOCATION', 'ENTITY',
                    'CITY',
                    'MISC', 'ORDINAL_ENTITY', 'IDEOLOGY', 'RELIGION', 'STATE_OR_PROVINCE', 'URL', 'CAUSE_OF_DEATH', 'O',
                    'TITLE', 'DATE', 'NUMBER', 'HANDLE', 'SCORE_ENTITY', 'DURATION', 'ORDINAL', 'MONEY', 'SET',
                    'CRIMINAL_CHARGE', '_1', '_2', '_3', '_4', '_2', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12',
                    '_13',
                    '_14', '_15'),
            additional_tokens=(
                    'date-entity', 'government-organization', 'temporal-quantity', 'amr-unknown', 'multi-sentence',
                    'political-party', 'monetary-quantity', 'ordinal-entity', 'religious-group', 'percentage-entity',
                    'world-region', 'url-entity', 'political-movement', 'et-cetera', 'at-least', 'mass-quantity',
                    'have-org-role-91', 'have-rel-role-91', 'include-91', 'have-concession-91', 'have-condition-91',
                    'be-located-at-91', 'rate-entity-91', 'instead-of-91', 'hyperlink-91', 'request-confirmation-91',
                    'have-purpose-91', 'be-temporally-at-91', 'regardless-91', 'have-polarity-91', 'byline-91',
                    'have-manner-91', 'have-part-91', 'have-quant-91', 'publication-91', 'be-from-91', 'have-mod-91',
                    'have-frequency-91', 'score-on-scale-91', 'have-li-91', 'be-compared-to-91', 'be-destined-for-91',
                    'course-91', 'have-subevent-91', 'street-address-91', 'have-extent-91', 'statistical-test-91',
                    'have-instrument-91', 'have-name-91', 'be-polite-91', '-00', '-01', '-02', '-03', '-04', '-05',
                    '-06',
                    '-07', '-08', '-09', '-10', '-11', '-12', '-13', '-14', '-15', '-16', '-17', '-18', '-19', '-20',
                    '-21',
                    '-22', '-23', '-24', '-25', '-26', '-27', '-28', '-29', '-20', '-31', '-32', '-33', '-34', '-35',
                    '-36',
                    '-37', '-38', '-39', '-40', '-41', '-42', '-43', '-44', '-45', '-46', '-47', '-48', '-49', '-50',
                    '-51',
                    '-52', '-53', '-54', '-55', '-56', '-57', '-58', '-59', '-60', '-61', '-62', '-63', '-64', '-65',
                    '-66',
                    '-67', '-68', '-69', '-70', '-71', '-72', '-73', '-74', '-75', '-76', '-77', '-78', '-79', '-80',
                    '-81',
                    '-82', '-83', '-84', '-85', '-86', '-87', '-88', '-89', '-90', '-91', '-92', '-93', '-94', '-95',
                    '-96',
                    '-97', '-98', '-of'),
            devices=None,
            logger=None,
            seed=None,
            finetune: Union[bool, str] = False,
            eval_trn=True,
            _device_placeholder=False,
            **kwargs):
        """Train the parser.

        Every argument (including the defaults below) is gathered with ``merge_locals_kwargs(locals(), kwargs)``
        and forwarded to the parent ``fit``. Because ``locals()`` is captured, no local variable may be introduced
        in this method before that call.

        Args:
            trn_data: Training data; forwarded to the parent ``fit``.
            dev_data: Development data; forwarded to the parent ``fit``.
            save_dir: Forwarded to the parent ``fit``; presumably the directory where the model is saved.
            batch_size: Batch size, ``32`` by default.
            epochs: Number of epochs, ``30`` by default.
            transformer: Identifier of the pretrained transformer. ``on_config_ready`` loads the transformer
                config from it.
            lr: Learning rate.
            grad_norm: Presumably the gradient clipping threshold; its consumer is not visible here.
            weight_decay: Weight decay.
            warmup_steps: Warm-up setting; whether it is a step count or a ratio is decided by its consumer,
                which is not visible here.
            dropout: Copied into the transformer config as ``dropout`` by ``on_config_ready``.
            attention_dropout: Copied into the transformer config as ``attention_dropout`` by
                ``on_config_ready``.
            pred_min: NOTE(review): presumably a minimum frequency for predicates to enter the vocabulary; confirm
                against the tokenizer.
            eval_after: NOTE(review): presumably the point in training after which evaluation starts; confirm.
            collapse_name_ops: ``True`` to merge name ops.
            use_pointer_tokens: ``True`` to use pointer tokens to represent variables.
            raw_graph: ``True`` to use the raw graph as input and skip all pre/post-processing steps.
            gradient_accumulation: Gradient accumulation setting, ``1`` by default.
            recategorization_tokens: Tokens used in re-categorization. They will be added to tokenizer too but do not
                put them into ``additional_tokens``.
                NOTE(review): the default lists ``'_2'`` twice; this looks unintended but is kept because changing
                the default could alter the vocabulary of newly trained models.
            additional_tokens: Tokens to be added to the tokenizer vocab.
                NOTE(review): the default lists ``'-20'`` twice and has no ``'-30'``, which looks like a typo; it is
                kept for the same reason as above.
            devices: Forwarded to the parent ``fit``.
            logger: Forwarded to the parent ``fit``.
            seed: Forwarded to the parent ``fit``.
            finetune: A ``bool`` or a ``str``; forwarded to the parent ``fit``.
            eval_trn: Forwarded to the parent ``fit``.
            _device_placeholder: Forwarded to the parent ``fit``.
            **kwargs: Extra settings merged with the arguments above.

        Returns:
            Whatever the parent ``fit`` returns.

        """
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def on_config_ready(self, **kwargs):
        """Create and cache the transformer configuration once the component config is ready.

        The result is stored in ``self._transformer_config``.

        Args:
            **kwargs: Forwarded to the parent implementation.
        """
        super().on_config_ready(**kwargs)
        transformer_config = AutoConfig_.from_pretrained(self.config.transformer)
        # Fixed settings applied to every model.
        transformer_config.output_past = False
        transformer_config.no_repeat_ngram_size = 0
        transformer_config.prefix = " "
        # Dropout rates come from the component config.
        transformer_config.dropout = self.config.dropout
        transformer_config.attention_dropout = self.config.attention_dropout
        self._transformer_config = transformer_config

    def evaluate(self, tst_data, save_dir=None, logger: logging.Logger = None, batch_size=None, output=True,
                 cache=None, ret_speed=False, **kwargs):
        """Evaluate on ``tst_data`` by delegating to the parent implementation.

        All arguments are forwarded to ``super().evaluate`` positionally and unchanged.
        NOTE(review): presumably this override exists only to expose different defaults (e.g. ``output=True``);
        confirm against the parent signature.

        Args:
            tst_data: Test data.
            save_dir: Forwarded to the parent.
            logger: Forwarded to the parent.
            batch_size: Forwarded to the parent.
            output: Forwarded to the parent.
            cache: Forwarded to the parent.
            ret_speed: Forwarded to the parent.
            **kwargs: Forwarded to the parent.

        Returns:
            Whatever the parent ``evaluate`` returns.
        """
        return super().evaluate(tst_data, save_dir, logger, batch_size, output, cache, ret_speed, **kwargs)

    def build_vocabs(self, trn: torch.utils.data.Dataset, logger: logging.Logger):
        """Collect additional tokens from the training set, then build the tokenizer and vocab from them.

        Args:
            trn: The training dataset handed to ``collect_additional_tokens``.
            logger: Not used here.
        """
        collected = set()
        self.collect_additional_tokens(collected, trn)
        # Sort so that the token order does not depend on set iteration order.
        ordered_tokens = sorted(collected)
        self.build_tokenizer(ordered_tokens)
        self.vocabs['additional_tokens'] = Vocab(idx_to_token=list(ordered_tokens))


================================================
FILE: hanlp/components/classifiers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-10 13:18

================================================
FILE: hanlp/components/classifiers/fasttext_classifier.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 13:31
import os
import sys
from typing import List, Union

import fasttext
from fasttext.FastText import _FastText

import hanlp
from hanlp.common.component import Component
from hanlp.utils.io_util import get_resource, stdout_redirected
from hanlp_common.io import load_json
from hanlp_common.reflection import classpath_of
from hanlp_common.structure import SerializableDict


class FastTextClassifier(Component):
    """A text classifier backed by a fastText model loaded with ``fasttext.load_model``."""

    def __init__(self) -> None:
        super().__init__()
        self._model: _FastText = None
        self.config = SerializableDict({
            'classpath': classpath_of(self),
            'hanlp_version': hanlp.__version__,
        })

    def load(self, save_dir, model_path=None, **kwargs):
        """Load the fastText model.

        Args:
            save_dir: A directory which may contain a ``config.json``. When it does, the config is loaded from
                it and its ``model_path`` entry (if any) takes precedence over the ``model_path`` argument.
                Otherwise ``save_dir`` itself is used as the model path unless ``model_path`` is given.
            model_path: Optional path of the model file.
            **kwargs: Not used.
        """
        config_path = os.path.join(save_dir, 'config.json')
        if os.path.isfile(config_path):
            self.config: dict = load_json(config_path)
            model_path = self.config.get('model_path', model_path)
        else:
            model_path = model_path or save_dir
            self.config['model_path'] = model_path
        filepath = get_resource(model_path)
        # Send whatever fastText prints to stdout while loading to the null device.
        with stdout_redirected(to=os.devnull, stdout=sys.stderr):
            self._model = fasttext.load_model(filepath)

    def predict(self, text: Union[str, List[str]], topk=False, prob=False, max_len=None, **kwargs):
        """
        Classify text.

        Args:
            text: A document or a list of documents.
            topk: ``False`` to return only the best label, ``True`` to return all labels, or an ``int`` to
                return the top-k labels. A list gives one setting per document.
            prob: Return also probabilities. A list gives one setting per document.
            max_len: Truncate each document to its first ``max_len`` characters for faster prediction.
            **kwargs: Not used

        Returns:
            Classification results. Per document: a label (``topk=False``) or a list of labels; with ``prob``,
            a ``(label, probability)`` tuple (``topk=False``) or a ``dict`` mapping labels to probabilities.
            A single result is returned when ``text`` is a ``str``, otherwise a list of results.
        """
        num_labels = len(self._model.get_labels())
        flat = isinstance(text, str)
        if flat:
            text = [text]
        if not isinstance(topk, list):
            topk = [topk] * len(text)
        if not isinstance(prob, list):
            prob = [prob] * len(text)
        if max_len:
            text = [x[:max_len] for x in text]
        # Every document is collapsed onto a single line before prediction.
        text = [x.replace('\n', ' ') for x in text]
        # All labels are always requested; truncation to top-k happens below per document.
        batch_labels, batch_probs = self._model.predict(text, k=num_labels)
        results = []
        for labels, probs, k, p in zip(batch_labels, batch_probs, topk, prob):
            labels = [self._strip_prefix(x) for x in labels]
            if k is False:
                labels = labels[0]
            elif k is True:
                pass
            elif k:
                labels = labels[:k]
            if p:
                probs = probs.tolist()
                if k is False:
                    result = labels, probs[0]
                else:
                    # ``zip`` stops at the shorter sequence, so only the kept labels get a probability.
                    result = dict(zip(labels, probs))
            else:
                result = labels
            results.append(result)
        if flat:
            results = results[0]
        return results

    @property
    def labels(self):
        """All labels known to the model, without the fastText label prefix."""
        return [self._strip_prefix(x) for x in self._model.get_labels()]

    @staticmethod
    def _strip_prefix(label: str):
        """Remove the leading ``__label__`` marker from ``label`` if it is present.

        Labels that do not start with the marker (e.g. from a model trained with a custom label prefix) are
        returned unchanged instead of losing their first characters.
        """
        prefix = '__label__'
        if label.startswith(prefix):
            return label[len(prefix):]
        return label


================================================
FILE: hanlp/components/classifiers/transformer_classifier.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-08 16:31
import logging
from abc import ABC
from typing import Callable, Union
from typing import List

import torch
from torch import nn
from torch.utils.data import DataLoader

from hanlp_common.constant import IDX
from hanlp.common.dataset import TableDataset, SortingSampler, PadSequenceDataLoader, TransformableDataset
from hanlp.common.torch_component import TorchComponent
from hanlp.common.vocab import Vocab
from hanlp.components.distillation.schedulers import LinearTeacherAnnealingScheduler
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.layers.transformers.encoder import TransformerEncoder
from hanlp.layers.transformers.pt_imports import PreTrainedModel, AutoTokenizer, BertTokenizer, AutoTokenizer_
from hanlp.layers.transformers.utils import transformer_sliding_window, build_optimizer_scheduler_with_transformer
from hanlp.metrics.accuracy import CategoricalAccuracy
from hanlp.transform.transformer_tokenizer import TransformerTextTokenizer
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs, merge_dict, isdebugging


class TransformerClassificationModel(nn.Module):
    """A transformer followed by dropout and a linear layer that produces class logits."""

    def __init__(self,
                 transformer: PreTrainedModel,
                 num_labels: int,
                 max_seq_length=512) -> None:
        """
        Args:
            transformer: The transformer; its config supplies ``hidden_dropout_prob`` and ``hidden_size``.
            num_labels: Output size of the linear classifier.
            max_seq_length: Inputs longer than this are encoded with a sliding window.
        """
        super().__init__()
        transformer_config = transformer.config
        self.max_seq_length = max_seq_length
        self.transformer = transformer
        self.dropout = nn.Dropout(transformer_config.hidden_dropout_prob)
        self.classifier = nn.Linear(transformer_config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Compute logits for a batch.

        Args:
            input_ids: Token ids; the size of the last dimension is taken as the sequence length.
            attention_mask: Passed to the transformer on the regular (non sliding window) path.
            token_type_ids: Passed to the transformer on the regular (non sliding window) path.

        Returns:
            The output of the linear classifier.
        """
        if input_ids.size(-1) > self.max_seq_length:
            # Too long for a single pass: only ``input_ids`` is handed to the sliding window helper.
            features = transformer_sliding_window(self.transformer, input_ids,
                                                  max_pieces=self.max_seq_length, ret_cls='max')
        else:
            outputs = self.transformer(input_ids, attention_mask, token_type_ids)
            # Take position 0 along the second axis of the first output.
            features = outputs[0][:, 0, :]
        return self.classifier(self.dropout(features))


class TransformerComponent(TorchComponent, ABC):
    def __init__(self, **kwargs) -> None:
        """ The base class for transformer based components. It offers methods to build transformer tokenizers,
        optimizers and models.

        Args:
            **kwargs: Passed to config.
        """
        super().__init__(**kwargs)
        self.transformer_tokenizer = None

    def build_optimizer(self,
                        trn,
                        epochs,
                        lr,
                        adam_epsilon,
                        weight_decay,
                        warmup_steps,
                        transformer_lr=None,
                        teacher=None,
                        **kwargs):
        """Build the optimizer and the scheduler.

        Args:
            trn: The training dataloader; its length is the number of batches per epoch.
            epochs: Number of epochs.
            lr: Learning rate.
            adam_epsilon: Epsilon passed to the optimizer builder.
            weight_decay: Weight decay.
            warmup_steps: Warm-up setting passed to the optimizer builder.
            transformer_lr: Learning rate for the transformer; falls back to ``lr`` when ``None``.
            teacher: If truthy, a ``LinearTeacherAnnealingScheduler`` is returned alongside the scheduler.
            **kwargs: Not used.

        Returns:
            ``(optimizer, scheduler)``, where ``scheduler`` is a ``(scheduler, lambda_scheduler)`` tuple when a
            teacher is given.
        """
        accumulation = self.config.get('gradient_accumulation', 1)
        total_steps = len(trn) * epochs // accumulation
        encoder_lr = lr if transformer_lr is None else transformer_lr
        optimizer, scheduler = build_optimizer_scheduler_with_transformer(self.model,
                                                                          self.model.encoder.transformer,
                                                                          lr, encoder_lr,
                                                                          total_steps, warmup_steps,
                                                                          weight_decay, adam_epsilon)
        if not teacher:
            return optimizer, scheduler
        return optimizer, (scheduler, LinearTeacherAnnealingScheduler(total_steps))

    def fit(self, trn_data, dev_data, save_dir,
            transformer=None,
            lr=5e-5,
            transformer_lr=None,
            adam_epsilon=1e-8,
            weight_decay=0,
            warmup_steps=0.1,
            batch_size=32,
            gradient_accumulation=1,
            grad_norm=5.0,
            transformer_grad_norm=None,
            average_subwords=False,
            scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
            word_dropout=None,
            hidden_dropout=None,
            max_seq_len=None,
            ret_raw_hidden_states=False,
            batch_max_tokens=None,
            epochs=3,
            logger=None,
            devices: Union[float, int, List[int]] = None,
            **kwargs):
        """Train the component.

        All arguments are gathered with ``merge_locals_kwargs(locals(), kwargs)`` and forwarded to the parent
        ``fit``, so no local variable may be created here before that call.
        """
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def on_config_ready(self, **kwargs):
        """Instantiate the transformer tokenizer named by ``self.config.transformer``.

        Args:
            **kwargs: Forwarded to the parent implementation.
        """
        super().on_config_ready(**kwargs)
        name = self.config.transformer
        # Identifiers containing ``albert_chinese`` are loaded with ``BertTokenizer``.
        tokenizer_cls = BertTokenizer if 'albert_chinese' in name else AutoTokenizer_
        self.transformer_tokenizer = tokenizer_cls.from_pretrained(name, use_fast=True)

    def build_transformer(self, training=True):
        """Build the transformer encoder described by the config.

        Args:
            training: Passed to ``TransformerEncoder``.

        Returns:
            The encoder, truncated to its first ``transformer_layers`` layers when that setting is truthy.
        """
        config = self.config
        encoder = TransformerEncoder(config.transformer, self.transformer_tokenizer,
                                     config.average_subwords,
                                     config.scalar_mix, config.word_dropout,
                                     ret_raw_hidden_states=config.ret_raw_hidden_states,
                                     training=training)
        num_layers = config.get('transformer_layers', None)
        if num_layers:
            layer_stack = encoder.transformer.encoder
            layer_stack.layer = layer_stack.layer[:num_layers]
        return encoder


class TransformerClassifier(TransformerComponent):

    def __init__(self, **kwargs) -> None:
        """A classifier using transformer as encoder.

        Args:
            **kwargs: Passed to config.
        """
        super().__init__(**kwargs)
        self.model: TransformerClassificationModel = None

    def build_criterion(self, **kwargs):
        """Return the cross entropy loss used for training and evaluation."""
        return nn.CrossEntropyLoss()

    def build_metric(self, **kwargs):
        """Return a fresh accuracy metric."""
        return CategoricalAccuracy()

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, **kwargs):
        """Train for ``epochs`` epochs and save the weights whenever the metric improves.

        The metric is read after evaluating on ``dev`` when ``dev`` is given, otherwise it holds the value
        accumulated during training.
        """
        best_metric = -1
        timer = CountdownTimer(epochs)
        ratio_width = len(f'{len(trn)}/{len(trn)}')
        for epoch in range(1, epochs + 1):
            logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
            self.fit_dataloader(trn, criterion, optimizer, metric, logger)
            if dev:
                self.evaluate_dataloader(dev, criterion, metric, logger, ratio_width=ratio_width)
            report = f'{timer.elapsed_human}/{timer.total_time_human}'
            dev_score = metric.get_metric()
            if dev_score > best_metric:
                self.save_weights(save_dir)
                best_metric = dev_score
                report += ' [red]saved[/red]'
            timer.log(report, ratio_percentage=False, newline=True, ratio=False)

    @property
    def label_vocab(self):
        """The vocab stored under ``self.config.label_key``."""
        return self.vocabs[self.config.label_key]

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        """Run one training epoch.

        Args:
            trn: Training dataloader.
            criterion: Loss function.
            optimizer: An ``(optimizer, scheduler)`` pair.
            metric: Metric, reset at the beginning of the epoch.
            logger: Logger used for progress reports.
            **kwargs: Not used.

        Returns:
            The summed loss divided by ``timer.total``.
        """
        self.model.train()
        timer = CountdownTimer(len(trn))
        optimizer, scheduler = optimizer
        total_loss = 0
        metric.reset()
        for batch in trn:
            optimizer.zero_grad()
            logits = self.feed_batch(batch)
            target = batch['label_id']
            loss = self.compute_loss(criterion, logits, target, batch)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            self.update_metric(metric, logits, target)
            timer.log(f'loss: {total_loss / (timer.current + 1):.4f} acc: {metric.get_metric():.2%}',
                      ratio_percentage=None,
                      logger=logger)
            del loss
        return total_loss / timer.total

    def update_metric(self, metric, logits: torch.Tensor, target, output=None):
        """Update ``metric`` and return the predicted label ids only when ``output`` is truthy."""
        metric(logits, target)
        if output:
            label_ids = logits.argmax(-1)
            return label_ids

    def compute_loss(self, criterion, logits, target, batch):
        """Apply ``criterion`` to ``logits`` and ``target``; ``batch`` is not used here."""
        loss = criterion(logits, target)
        return loss

    def feed_batch(self, batch) -> torch.LongTensor:
        """Run the model on the ``input_ids``, ``attention_mask`` and ``token_type_ids`` of ``batch``."""
        logits = self.model(*[batch[key] for key in ['input_ids', 'attention_mask', 'token_type_ids']])
        return logits

    # noinspection PyMethodOverriding
    def evaluate_dataloader(self,
                            data: DataLoader,
                            criterion: Callable,
                            metric,
                            logger,
                            ratio_width=None,
                            filename=None,
                            output=None,
                            **kwargs):
        """Evaluate the model on ``data``.

        Args:
            data: Dataloader to evaluate on.
            criterion: Loss function.
            metric: Metric, reset before evaluation.
            logger: Logger used for progress reports.
            ratio_width: Passed to the progress timer.
            filename: If given, it is prepended to the report together with the speed in samples/sec.
            output: Optional path of a file receiving one tab separated line per sample: text_a, optional
                text_b, predicted label, gold label.
            **kwargs: Not used.

        Returns:
            The summed loss divided by ``timer.total``.
        """
        self.model.eval()
        timer = CountdownTimer(len(data))
        total_loss = 0
        metric.reset()
        num_samples = 0
        if output:
            output = open(output, 'w')
        try:
            # This method is also called from the training loop, so gradients are disabled explicitly.
            with torch.no_grad():
                for batch in data:
                    logits = self.feed_batch(batch)
                    target = batch['label_id']
                    loss = self.compute_loss(criterion, logits, target, batch)
                    total_loss += loss.item()
                    label_ids = self.update_metric(metric, logits, target, output)
                    if output:
                        labels = [self.vocabs[self.config.label_key].idx_to_token[i] for i in label_ids.tolist()]
                        for i, label in enumerate(labels):
                            # text_a text_b pred gold
                            columns = [batch[self.config.text_a_key][i]]
                            if self.config.text_b_key:
                                columns.append(batch[self.config.text_b_key][i])
                            columns.append(label)
                            columns.append(batch[self.config.label_key][i])
                            output.write('\t'.join(columns))
                            output.write('\n')
                    num_samples += len(target)
                    report = f'loss: {total_loss / (timer.current + 1):.4f} acc: {metric.get_metric():.2%}'
                    if filename:
                        report = f'{filename} {report} {num_samples / timer.elapsed:.0f} samples/sec'
                    timer.log(report, ratio_percentage=None, logger=logger, ratio_width=ratio_width)
        finally:
            # Close the prediction file even when evaluation fails halfway.
            if output:
                output.close()
        return total_loss / timer.total

    # noinspection PyMethodOverriding
    def build_model(self, transformer, training=True, **kwargs) -> torch.nn.Module:
        """Build the classification model with one output per entry of ``self.vocabs.label``.

        Args:
            transformer: Not used directly; the transformer is built from the config.
            training: Passed to ``build_transformer``.
            **kwargs: Not used.
        """
        transformer = self.build_transformer(training=training).transformer
        model = TransformerClassificationModel(transformer, len(self.vocabs.label))
        return model

    # noinspection PyMethodOverriding
    def build_dataloader(self, data, batch_size, shuffle, device, text_a_key, text_b_key,
                         label_key,
                         logger: logging.Logger = None,
                         sorting=True,
                         **kwargs) -> DataLoader:
        """Build a dataloader over ``data``.

        When the vocabs are still mutable and neither ``text_a_key`` nor ``text_b_key`` is given, the keys are
        guessed from the dataset headers: two headers mean ``(text_a, label)``, three or more mean
        ``(text_a, text_b, ..., label)``. The vocabs are then built from the dataset.

        Args:
            data: Anything accepted by ``build_dataset``.
            batch_size: Batch size; falls back to ``self.config.batch_size`` when falsy.
            shuffle: Whether to shuffle.
            device: Device passed to the dataloader.
            text_a_key: Key of the first text.
            text_b_key: Key of the optional second text.
            label_key: Key of the label.
            logger: Optional logger.
            sorting: ``True`` to batch samples with a ``SortingSampler`` (skipped while debugging).
            **kwargs: Not used.

        Raises:
            ValueError: If the keys have to be guessed and the dataset has fewer than two headers.
        """
        if not batch_size:
            batch_size = self.config.batch_size
        dataset = self.build_dataset(data)
        dataset.append_transform(self.vocabs)
        if self.vocabs.mutable:
            if not any([text_a_key, text_b_key]):
                if len(dataset.headers) == 2:
                    self.config.text_a_key = dataset.headers[0]
                    self.config.label_key = dataset.headers[1]
                elif len(dataset.headers) >= 3:
                    self.config.text_a_key, self.config.text_b_key, self.config.label_key = dataset.headers[0], \
                                                                                            dataset.headers[1], \
                                                                                            dataset.headers[-1]
                else:
                    raise ValueError('Wrong dataset format')
                if logger:
                    # A tuple keeps the reported order stable across runs.
                    report = ', '.join(f'{k}={self.config[k]}'
                                       for k in ('text_a_key', 'text_b_key', 'label_key') if self.config[k])
                    logger.info(f'Guess [bold][blue]{report}[/blue][/bold] according to the headers of training '
                                f'dataset: [blue]{dataset}[/blue]')
            self.build_vocabs(dataset, logger)
            dataset.purge_cache()
        # ``fit`` stores the limit as ``max_seq_len``; an explicit ``max_seq_length`` still takes precedence.
        max_seq_length = self.config.get('max_seq_length', None)
        if max_seq_length is None:
            max_seq_length = self.config.get('max_seq_len', None)
        dataset.append_transform(TransformerTextTokenizer(tokenizer=self.transformer_tokenizer,
                                                          text_a_key=self.config.text_a_key,
                                                          text_b_key=self.config.text_b_key,
                                                          max_seq_length=max_seq_length,
                                                          truncate_long_sequences=self.config.truncate_long_sequences,
                                                          output_key=''))
        batch_sampler = None
        if sorting and not isdebugging():
            if dataset.cache and len(dataset) > 1000:
                # Large cached datasets get a progress report while they are pre-processed.
                timer = CountdownTimer(len(dataset))
                lens = []
                for sample in dataset:
                    lens.append(len(sample['input_ids']))
                    timer.log('Pre-processing and caching dataset [blink][yellow]...[/yellow][/blink]',
                              ratio_percentage=None)
            else:
                lens = [len(sample['input_ids']) for sample in dataset]
            batch_sampler = SortingSampler(lens, batch_size=batch_size, shuffle=shuffle,
                                           batch_max_tokens=self.config.batch_max_tokens)
        return PadSequenceDataLoader(dataset, batch_size, shuffle, batch_sampler=batch_sampler, device=device)

    def build_dataset(self, data) -> TransformableDataset:
        """Wrap ``data`` into a ``TableDataset``.

        Args:
            data: A ``str`` (loaded with caching enabled), a ``TableDataset`` (returned as is) or a ``list``.

        Raises:
            ValueError: If ``data`` is of any other type.
        """
        if isinstance(data, str):
            dataset = TableDataset(data, cache=True)
        elif isinstance(data, TableDataset):
            dataset = data
        elif isinstance(data, list):
            dataset = TableDataset(data)
        else:
            raise ValueError(f'Unsupported data {data}')
        return dataset

    def predict(self, data: Union[str, List[str]], batch_size: int = None, **kwargs):
        """Predict labels.

        Args:
            data: A text, a ``(text_a, text_b)`` tuple, or a list of either. Pairs are expected when
                ``self.config.text_b_key`` is set.
            batch_size: Optional batch size overriding the configured one.
            **kwargs: Not used.

        Returns:
            ``[]`` for falsy ``data``, one label for a single ``str``/``tuple``, otherwise a list of labels in
            input order.
        """
        if not data:
            return []
        flat = isinstance(data, str) or isinstance(data, tuple)
        if flat:
            data = [data]
        samples = []
        for idx, d in enumerate(data):
            sample = {IDX: idx}
            if self.config.text_b_key:
                sample[self.config.text_a_key] = d[0]
                sample[self.config.text_b_key] = d[1]
            else:
                sample[self.config.text_a_key] = d
            samples.append(sample)
        dataloader = self.build_dataloader(samples,
                                           sorting=False,
                                           **merge_dict(self.config,
                                                        batch_size=batch_size,
                                                        shuffle=False,
                                                        device=self.device,
                                                        overwrite=True)
                                           )
        labels = [None] * len(data)
        vocab = self.vocabs.label
        for batch in dataloader:
            logits = self.feed_batch(batch)
            pred = logits.argmax(-1)
            pred = pred.tolist()
            # Write each prediction back to the slot of its original sample.
            for idx, tag in zip(batch[IDX], pred):
                labels[idx] = vocab.idx_to_token[tag]
        if flat:
            return labels[0]
        return labels

    def fit(self, trn_data, dev_data, save_dir,
            text_a_key=None,
            text_b_key=None,
            label_key=None,
            transformer=None,
            max_seq_len=512,
            truncate_long_sequences=True,
            lr=5e-5,
            transformer_lr=None,
            adam_epsilon=1e-6,
            weight_decay=0,
            warmup_steps=0.1,
            batch_size=32,
            batch_max_tokens=None,
            epochs=3,
            logger=None,
            devices: Union[float, int, List[int]] = None,
            **kwargs):
        """Train the classifier.

        All arguments are gathered with ``merge_locals_kwargs(locals(), kwargs)`` and forwarded to the parent
        ``fit``, so no local variable may be created here before that call.
        """
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def build_vocabs(self, trn, logger, **kwargs):
        """Create the label vocab, fill it by iterating over ``trn`` once, then lock and summarize the vocabs."""
        self.vocabs.label = Vocab(pad_token=None, unk_token=None)
        # The loop body is intentionally empty: the pass over the dataset exists for its side effects
        # (the vocabs were appended to the dataset as a transform by ``build_dataloader``).
        for each in trn:
            pass
        self.vocabs.lock()
        self.vocabs.summary(logger)


================================================
FILE: hanlp/components/classifiers/transformer_classifier_hf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2023-02-17 17:54
import logging
from typing import List, Union, Callable

import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, PreTrainedTokenizer, AutoTokenizer

from hanlp.common.dataset import TableDataset, PadSequenceDataLoader, SortingSamplerBuilder
from hanlp.common.torch_component import TorchComponent
from hanlp_common.constant import IDX
from hanlp_common.util import split_dict, reorder


class TransformerClassifierHF(TorchComponent):
    """A prediction-only classifier wrapping ``AutoModelForSequenceClassification``.

    The tokenizer and the model are both loaded from ``save_dir``; all training hooks raise
    ``NotImplementedError``.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self._tokenizer: PreTrainedTokenizer = None

    def build_dataloader(self, data, sampler_builder=None, shuffle=False, device=None,
                         logger: logging.Logger = None,
                         **kwargs) -> DataLoader:
        """Build a dataloader over already tokenized samples.

        Args:
            data: Samples, each providing ``input_ids``.
            sampler_builder: Optional sampler builder; a ``SortingSamplerBuilder`` with a batch size of 32 is
                used when omitted.
            shuffle: Passed to the sampler builder.
            device: Device passed to the dataloader.
            logger: Not used.
            **kwargs: Not used.
        """
        dataset = TableDataset(data)
        lens = [len(sample['input_ids']) for sample in dataset]
        if sampler_builder:
            sampler = sampler_builder.build(lens, shuffle, 1)
        else:
            sampler = SortingSamplerBuilder(batch_size=32).build(lens, shuffle, 1)
        loader = PadSequenceDataLoader(dataset=dataset,
                                       batch_sampler=sampler,
                                       pad={'input_ids': self._tokenizer.pad_token_id},
                                       device=device,
                                       vocabs=self.vocabs)
        return loader

    def build_optimizer(self, **kwargs):
        raise NotImplementedError()

    def build_criterion(self, **kwargs):
        raise NotImplementedError()

    def build_metric(self, **kwargs):
        raise NotImplementedError()

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, **kwargs):
        raise NotImplementedError()

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        raise NotImplementedError()

    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        raise NotImplementedError()

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Load the tokenizer from ``save_dir``; ``filename`` is not used."""
        self._tokenizer = AutoTokenizer.from_pretrained(save_dir)

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """Do nothing: the weights are loaded by ``build_model``."""
        pass

    def build_model(self, training=True, save_dir=None, **kwargs) -> torch.nn.Module:
        """Load the sequence classification model from ``save_dir``."""
        return AutoModelForSequenceClassification.from_pretrained(save_dir)

    def predict(self, text: Union[str, List[str]], topk=False, prob=False, **kwargs):
        """
        Classify text.

        Args:
            text: A document or a list of documents. Documents are truncated to the model's
                ``max_position_embeddings`` tokens.
            topk: ``False`` to return only the best label, ``True`` to return all labels, or an ``int`` to
                return the top-k labels. A list gives one setting per document.
            prob: Return also probabilities. A list gives one setting per document.
            **kwargs: Not used

        Returns:
            Classification results. Per document: a label (``topk=False``) or a list of labels; with ``prob``,
            a ``(label, probability)`` tuple (``topk=False``) or a ``dict`` mapping labels to probabilities.
            A single result is returned when ``text`` is a ``str``, otherwise a list of results.
        """
        flat = isinstance(text, str)
        if flat:
            text = [text]
        if not isinstance(topk, list):
            topk = [topk] * len(text)
        if not isinstance(prob, list):
            prob = [prob] * len(text)
        # noinspection PyTypeChecker
        dataloader = self.build_dataloader(
            split_dict(self._tokenizer(text, max_length=self.model.config.max_position_embeddings, truncation=True,
                                       return_token_type_ids=False, return_attention_mask=False)),
            device=self.device)
        results = []
        order = []
        id2label = self.model.config.id2label
        pad_token_id = self._tokenizer.pad_token_id
        for batch in dataloader:
            input_ids = batch['input_ids']
            # Shorter documents are padded with ``pad_token_id``. Masking those positions keeps the model from
            # attending to padding, so a prediction does not depend on which documents share its batch.
            # Assumes the pad id never occurs inside real text.
            attention_mask = input_ids.ne(pad_token_id).long() if pad_token_id is not None else None
            logits = self.model(input_ids=input_ids, attention_mask=attention_mask).logits
            # Sort descending so that labels and probabilities are aligned and ordered by score.
            logits, batch_labels = logits.sort(descending=True)
            batch_labels = [[id2label[l] for l in ls] for ls in batch_labels.tolist()]
            batch_probs = logits.softmax(dim=-1).tolist()
            for labels, probs, i in zip(batch_labels, batch_probs, batch[IDX]):
                k = topk[i]
                p = prob[i]
                if k is False:
                    labels = labels[0]
                elif k is True:
                    pass
                elif k:
                    labels = labels[:k]
                if p:
                    if k is False:
                        result = labels, probs[0]
                    else:
                        # ``zip`` stops at the shorter sequence, so only the kept labels get a probability.
                        result = dict(zip(labels, probs))
                else:
                    result = labels
                results.append(result)
            order.extend(batch[IDX])
        # The sampler may reorder documents; restore the input order.
        results = reorder(results, order)
        if flat:
            results = results[0]
        return results

    @property
    def labels(self):
        """Label names ordered by their ids in the model config."""
        return [x[1] for x in sorted(self.model.config.id2label.items())]


================================================
FILE: hanlp/components/classifiers/transformer_classifier_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-10 13:19

import math
from typing import Union, Tuple, Any, Iterable
import tensorflow as tf
from hanlp.common.keras_component import KerasComponent
from hanlp_common.structure import SerializableDict
from hanlp.layers.transformers.loader_tf import build_transformer
from hanlp.optimizers.adamw import create_optimizer
from hanlp.transform.table_tf import TableTransform
from hanlp.utils.log_util import logger
from hanlp_common.util import merge_locals_kwargs
from transformers.tokenization_utils import PreTrainedTokenizer

class TransformerTextTransform(TableTransform):
    """Table transform that converts text columns into fixed-length transformer inputs.

    Every sample becomes ``(token_ids, attention_mask, segment_ids)``, each exactly
    ``config.max_length`` long, paired with its label.
    """

    def __init__(self, config: SerializableDict = None, map_x=False, map_y=True, x_columns=None,
                 y_column=-1, skip_header=True, delimiter='auto', multi_label=False, **kwargs) -> None:
        # NOTE(review): arguments are forwarded positionally with ``multi_label`` ahead of
        # ``skip_header``; this presumably mirrors TableTransform.__init__ - confirm there.
        super().__init__(config, map_x, map_y, x_columns, y_column, multi_label, skip_header, delimiter, **kwargs)
        # Set by TransformerClassifierTF.build_model once the transformer has been loaded.
        self.tokenizer: PreTrainedTokenizer = None

    def inputs_to_samples(self, inputs, gold=False):
        """Yield ``((token_ids, attention_mask, segment_ids), label)`` for every input sample.

        While ``self.label_vocab`` is still mutable, ``(None, label)`` is yielded instead and no
        tokenization happens.

        Args:
            inputs: Passed through to ``TableTransform.inputs_to_samples``.
            gold: Passed through to ``TableTransform.inputs_to_samples``.

        Yields:
            A ``(features, label)`` pair; the three feature lists all have ``config.max_length`` items.
        """
        tokenizer = self.tokenizer
        max_length = self.config.max_length
        num_features = None
        pad_token = None if self.label_vocab.mutable else tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
        for (X, Y) in super().inputs_to_samples(inputs, gold):
            if self.label_vocab.mutable:
                yield None, Y
                continue
            if isinstance(X, str):
                X = (X,)
            if num_features is None:
                num_features = self.config.num_features
            assert num_features == len(X), f'Numbers of features {num_features} ' \
                                           f'inconsistent with current {len(X)}={X}'
            text_a = X[0]
            text_b = X[1] if len(X) > 1 else None
            tokens_a = tokenizer.tokenize(text_a)
            tokens_b = tokenizer.tokenize(text_b) if text_b else None
            tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
            segment_ids = [0] * len(tokens)
            if tokens_b:
                # The second segment is appended as-is; no closing [SEP] is added after it.
                tokens += tokens_b
                segment_ids += [1] * len(tokens_b)
            token_ids = tokenizer.convert_tokens_to_ids(tokens)
            attention_mask = [1] * len(token_ids)
            diff = max_length - len(token_ids)
            if diff < 0:
                # Over-long inputs are silently cut at max_length (the tail, including [SEP], is dropped).
                token_ids = token_ids[:max_length]
                attention_mask = attention_mask[:max_length]
                segment_ids = segment_ids[:max_length]
            elif diff > 0:
                # Right-pad; padded positions get attention 0 and segment 0.
                token_ids += [pad_token] * diff
                attention_mask += [0] * diff
                segment_ids += [0] * diff

            assert len(token_ids) == max_length, "Error with input length {} vs {}".format(len(token_ids), max_length)
            assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask),
                                                                                                max_length)
            assert len(segment_ids) == max_length, "Error with input length {} vs {}".format(len(segment_ids),
                                                                                             max_length)

            label = Y
            yield (token_ids, attention_mask, segment_ids), label

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Describe the dataset elements produced by :meth:`inputs_to_samples`.

        Returns:
            ``(types, shapes, values)``: three int32 features of length ``config.max_length`` plus a
            string label, which is a variable-length vector when ``multi_label`` is configured and a
            scalar otherwise. ``values`` holds ``0`` for the features and the label vocab's
            ``safe_pad_token`` for the label.
        """
        max_length = self.config.max_length
        types = (tf.int32, tf.int32, tf.int32), tf.string
        shapes = ([max_length], [max_length], [max_length]), [None, ] if self.config.get('multi_label', None) else []
        values = (0, 0, 0), self.label_vocab.safe_pad_token
        return types, shapes, values

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        """Unsupported: features are already converted to ids in :meth:`inputs_to_samples`.

        Raises:
            SystemExit: Always, after logging a fatal message.
        """
        # Reaching this method means x-mapping was requested, which this transform cannot do;
        # the constructor default is map_x=False.
        logger.fatal('map_x should always be set to False')
        # Raise directly instead of calling the ``site`` builtin ``exit``, which is not always available.
        raise SystemExit(1)

    def y_to_idx(self, y) -> tf.Tensor:
        """Map label strings to vocabulary indices, or to a multi-hot vector when ``multi_label`` is set."""
        if self.config.get('multi_label', None):
            # need to change index to binary vector
            mapped = tf.map_fn(fn=lambda x: tf.cast(self.label_vocab.lookup(x), tf.int32), elems=y,
                               fn_output_signature=tf.TensorSpec(dtype=tf.dtypes.int32, shape=[None, ]))
            one_hots = tf.one_hot(mapped, len(self.label_vocab))
            # Collapse the per-label one-hot rows of each sample into a single vector.
            idx = tf.reduce_sum(one_hots, -2)
        else:
            idx = self.label_vocab.lookup(y)
        return idx

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
                     batch=None) -> Iterable:
        """Yield the label token for every prediction in ``Y``.

        In the single-label case the arg-max over the last axis selects the label.
        """
        # Prediction to be Y > 0:
        if self.config.get('multi_label', None):
            # NOTE(review): each ``y`` below is then a whole row of scores rather than a scalar
            # index; confirm ``idx_to_token`` accepts that, or threshold at 0 as noted above.
            preds = Y
        else:
            preds = tf.argmax(Y, axis=-1)
        for y in preds:
            yield self.label_vocab.idx_to_token[y]

    def input_is_single_sample(self, input: Any) -> bool:
        """Return ``True`` when ``input`` is one sample, i.e. a ``str`` or a ``tuple``."""
        return isinstance(input, (str, tuple))


class TransformerClassifierTF(KerasComponent):
    """Keras text classifier built on a pretrained transformer."""

    def __init__(self, bert_text_transform=None) -> None:
        """
        Args:
            bert_text_transform: Transform to use; a fresh ``TransformerTextTransform`` when falsy.
        """
        if not bert_text_transform:
            bert_text_transform = TransformerTextTransform()
        super().__init__(bert_text_transform)
        self.model: tf.keras.Model
        self.transform: TransformerTextTransform = bert_text_transform

    # noinspection PyMethodOverriding
    def fit(self, trn_data: Any, dev_data: Any, save_dir: str, transformer: str, max_length: int = 128,
            optimizer='adamw', warmup_steps_ratio=0.1, use_amp=False, batch_size=32,
            epochs=3, logger=None, verbose=1, **kwargs):
        """Forward all arguments, merged with ``kwargs``, to ``KerasComponent.fit``."""
        # locals() is forwarded as keyword arguments, so no extra local may be defined before this line.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def evaluate_output(self, tst_data, out, num_batches, metric):
        """Write per-sample predictions to ``out`` as TSV and return the running accuracy.

        Args:
            tst_data: Iterable of batches where ``batch[0]`` holds the model inputs and ``batch[1]``
                the gold label ids.
            out: Writable text stream receiving ``sentence\\tpred\\tgold`` lines.
            num_batches: Total number of batches, used for progress printing only.
            metric: Metric name, used for progress printing only.

        Returns:
            ``correct / total`` over all samples seen, or ``0`` when ``tst_data`` is empty.
        """
        out.write('sentence\tpred\tgold\n')
        total, correct, score = 0, 0, 0
        for idx, batch in enumerate(tst_data):
            outputs = self.model.predict_on_batch(batch[0])
            outputs = tf.argmax(outputs, axis=1)
            # batch[0][0] is the first model input, i.e. the token ids.
            for X, Y_pred, Y_gold, in zip(batch[0][0], outputs, batch[1]):
                feature = ' '.join(self.transform.tokenizer.convert_ids_to_tokens(X.numpy()))
                feature = feature.replace(' ##', '')  # fix sub-word generated by BERT tagger
                out.write('{}\t{}\t{}\n'.format(feature,
                                                self._y_id_to_str(Y_pred),
                                                self._y_id_to_str(Y_gold)))
                total += 1
                correct += int(tf.equal(Y_pred, Y_gold).numpy())
            score = correct / total
            print('\r{}/{} {}: {:.2f}'.format(idx + 1, num_batches, metric, score * 100), end='')
        print()
        return score

    def _y_id_to_str(self, Y_pred) -> str:
        """Look up the label token for a scalar label-id tensor."""
        return self.transform.label_vocab.idx_to_token[Y_pred.numpy()]

    def build_loss(self, loss, **kwargs):
        """Return ``loss`` if given, else a default that matches the ``multi_label`` setting.

        Args:
            loss: A ``tf.keras.losses.Loss`` instance, or a falsy value to use the default.

        Returns:
            ``loss`` itself, ``BinaryCrossentropy(from_logits=True)`` for multi-label, or
            ``SparseCategoricalCrossentropy(from_logits=True)`` otherwise.
        """
        if loss:
            # The base class is ``Loss``; the lowercase attribute does not exist in tf.keras.losses.
            assert isinstance(loss, tf.keras.losses.Loss), 'Must specify loss as an instance in tf.keras.losses'
            return loss
        if self.config.get('multi_label', None):
            # Loss to be BinaryCrossentropy for multi-label:
            return tf.keras.losses.BinaryCrossentropy(from_logits=True)
        return tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    # noinspection PyMethodOverriding
    def build_optimizer(self, optimizer, use_amp, train_steps, warmup_steps, **kwargs):
        """Create the optimizer and, for ``'adamw'``, store its serialized form in ``self.config.optimizer``.

        Args:
            optimizer: ``'adamw'`` for the warm-up AdamW, anything else is delegated to the parent class.
            use_amp: Wrap the optimizer in a dynamic loss-scale optimizer when truthy.
            train_steps: Total number of training steps.
            warmup_steps: Number of warm-up steps.
        """
        if optimizer == 'adamw':
            opt = create_optimizer(init_lr=5e-5, num_train_steps=train_steps, num_warmup_steps=warmup_steps)
            self.config.optimizer = tf.keras.utils.serialize_keras_object(opt)
            lr_config = self.config.optimizer['config']['learning_rate']['config']
            decay_schedule_fn = lr_config['decay_schedule_fn']
            if hasattr(decay_schedule_fn, 'get_config'):
                # Still a live schedule object: replace it with its plain config so the
                # optimizer config holds only serializable values. Call the method that was
                # just checked for (``get_config``), not a non-existent ``config``.
                lr_config['decay_schedule_fn'] = {k: v for k, v in decay_schedule_fn.get_config().items()
                                                  if not k.startswith('_')}
        else:
            opt = super().build_optimizer(optimizer)
        if use_amp:
            # loss scaling is currently required when using mixed precision
            opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
        return opt

    # noinspection PyMethodOverriding
    def build_model(self, transformer, max_length, **kwargs):
        """Load the transformer model and attach its tokenizer to ``self.transform``."""
        model, self.transform.tokenizer = build_transformer(transformer, max_length, len(self.transform.label_vocab),
                                                            tagging=False)
        return model

    def build_vocab(self, trn_data, logger):
        """Build vocabularies via the parent class and derive ``config.warmup_steps``.

        Returns:
            Whatever the parent ``build_vocab`` returns, used here as the number of training examples.
        """
        train_examples = super().build_vocab(trn_data, logger)
        warmup_steps_per_epoch = math.ceil(train_examples * self.config.warmup_steps_ratio / self.config.batch_size)
        self.config.warmup_steps = warmup_steps_per_epoch * self.config.epochs
        return train_examples

    def build_metrics(self, metrics, logger, **kwargs):
        """Return a one-element list: binary accuracy for multi-label, else sparse categorical accuracy."""
        if self.config.get('multi_label', None):
            metric = tf.keras.metrics.BinaryAccuracy('binary_accuracy')
        else:
            metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
        return [metric]


================================================
FILE: hanlp/components/classifiers/transformer_regression_hf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2023-02-17 17:54
import logging
from typing import List, Union, Callable

import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, PreTrainedTokenizer, AutoTokenizer

from hanlp.common.dataset import TableDataset, PadSequenceDataLoader, SortingSamplerBuilder
from hanlp.common.torch_component import TorchComponent
from hanlp_common.constant import IDX
from hanlp_common.util import split_dict, reorder


class TransformerRegressionHF(TorchComponent):
    """Inference-only wrapper around a Hugging Face sequence-classification model used as a regressor.

    All training hooks raise ``NotImplementedError``; model and tokenizer are loaded with
    ``from_pretrained`` from the save directory.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        # Populated by load_vocabs().
        self._tokenizer: PreTrainedTokenizer = None

    def build_dataloader(self, data, sampler_builder=None, shuffle=False, device=None,
                         logger: logging.Logger = None,
                         **kwargs) -> DataLoader:
        """Wrap tokenized samples in a padding dataloader.

        Args:
            data: Samples, each carrying an ``input_ids`` sequence.
            sampler_builder: Optional builder for the batch sampler; a ``SortingSamplerBuilder`` with
                ``batch_size=32`` is used when omitted.
            shuffle: Forwarded to the sampler builder.
            device: Device the batches are placed on.
            logger: Not used.
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader`` that pads ``input_ids`` with the tokenizer's pad token id.
        """
        dataset = TableDataset(data)
        lens = [len(sample['input_ids']) for sample in dataset]
        if sampler_builder:
            sampler = sampler_builder.build(lens, shuffle, 1)
        else:
            sampler = SortingSamplerBuilder(batch_size=32).build(lens, shuffle, 1)
        loader = PadSequenceDataLoader(dataset=dataset,
                                       batch_sampler=sampler,
                                       pad={'input_ids': self._tokenizer.pad_token_id},
                                       device=device,
                                       vocabs=self.vocabs)
        return loader

    def build_optimizer(self, **kwargs):
        """Not supported: this component is inference-only."""
        raise NotImplementedError()

    def build_criterion(self, **kwargs):
        """Not supported: this component is inference-only."""
        raise NotImplementedError()

    def build_metric(self, **kwargs):
        """Not supported: this component is inference-only."""
        raise NotImplementedError()

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, **kwargs):
        """Not supported: this component is inference-only."""
        raise NotImplementedError()

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        """Not supported: this component is inference-only."""
        raise NotImplementedError()

    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        """Not supported: this component is inference-only."""
        raise NotImplementedError()

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Load the tokenizer from ``save_dir``; ``filename`` is ignored."""
        self._tokenizer = AutoTokenizer.from_pretrained(save_dir)

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """No-op: ``build_model`` already loads the weights through ``from_pretrained``."""
        pass

    def build_model(self, training=True, save_dir=None, **kwargs) -> torch.nn.Module:
        """Load the pretrained sequence-classification model stored in ``save_dir``."""
        return AutoModelForSequenceClassification.from_pretrained(save_dir)

    def predict(self, text: Union[str, List[str]], **kwargs):
        """
        Predict a score for each document.

        Args:
            text: A document or a list of documents. Documents are truncated to the model's
                ``max_position_embeddings`` tokens.
            **kwargs: Not used

        Returns:
            The model output with its last axis squeezed and clipped to ``[-1, 1]``: a single value
            when ``text`` is a ``str``, otherwise a list with one value per document in input order.
        """
        flat = isinstance(text, str)
        if flat:
            text = [text]
        # noinspection PyTypeChecker
        dataloader = self.build_dataloader(
            split_dict(self._tokenizer(text, max_length=self.model.config.max_position_embeddings, truncation=True,
                                       return_token_type_ids=False, return_attention_mask=False)),
            device=self.device)
        results = []
        order = []
        # Pure inference: no autograd graph is needed, so avoid building one.
        with torch.no_grad():
            for batch in dataloader:
                logits = self.model(input_ids=batch['input_ids']).logits
                scores = logits.squeeze(-1).clip(-1, 1)
                results.extend(scores.tolist())
                order.extend(batch[IDX])
        # Batches are not yielded in input order; restore it from the recorded sample indices.
        results = reorder(results, order)
        if flat:
            results = results[0]
        return results


================================================
FILE: hanlp/components/distillation/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-17 20:29


================================================
FILE: hanlp/components/distillation/distillable_component.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-17 20:30
from abc import ABC
from copy import copy

import hanlp
from hanlp.common.torch_component import TorchComponent
from hanlp.components.distillation.losses import KnowledgeDistillationLoss
from hanlp.components.distillation.schedulers import TemperatureScheduler
from hanlp.utils.torch_util import cuda_devices
from hanlp_common.util import merge_locals_kwargs


class DistillableComponent(TorchComponent, ABC):
    """Mixin-style base for torch components that can be trained by distilling a teacher component."""

    # noinspection PyMethodMayBeStatic,PyTypeChecker
    def build_teacher(self, teacher: str, devices) -> TorchComponent:
        """Load the teacher component identified by ``teacher`` onto ``devices`` via ``hanlp.load``."""
        return hanlp.load(teacher, load_kwargs={'devices': devices})

    def distill(self,
                teacher: str,
                trn_data,
                dev_data,
                save_dir,
                batch_size=None,
                epochs=None,
                kd_criterion='kd_ce_loss',
                temperature_scheduler='flsw',
                devices=None,
                logger=None,
                seed=None,
                **kwargs):
        """Train this component against a teacher by delegating to ``fit``.

        Args:
            teacher: Identifier of the teacher, passed to :meth:`build_teacher`.
            trn_data: Training data, forwarded to ``fit``.
            dev_data: Development data, forwarded to ``fit``.
            save_dir: Output directory, forwarded to ``fit``.
            batch_size: Batch size; falls back to the teacher config's ``batch_size`` when falsy.
            epochs: Number of epochs; falls back to the teacher config's ``epochs`` when falsy.
            kd_criterion: A loss name (wrapped in ``KnowledgeDistillationLoss``) or a ready object.
            temperature_scheduler: A scheduler name (resolved with ``TemperatureScheduler.from_name``)
                or a ready object.
            devices: Devices to use; ``cuda_devices()`` when falsy.
            logger: Forwarded to ``fit``.
            seed: Forwarded to ``fit``.
            **kwargs: Overrides applied on top of the copied teacher config.

        Returns:
            Whatever ``fit`` returns.
        """
        devices = devices or cuda_devices()
        if isinstance(kd_criterion, str):
            kd_criterion = KnowledgeDistillationLoss(kd_criterion)
        if isinstance(temperature_scheduler, str):
            temperature_scheduler = TemperatureScheduler.from_name(temperature_scheduler)
        # ``teacher`` is rebound from its identifier to the loaded component from here on.
        teacher = self.build_teacher(teacher, devices=devices)
        # The student shares the teacher's vocabularies.
        self.vocabs = teacher.vocabs
        # Shallow copy, so the update below does not touch the teacher's own config object.
        config = copy(teacher.config)
        batch_size = batch_size or config.get('batch_size', None)
        epochs = epochs or config.get('epochs', None)
        config.update(kwargs)
        # The local variables of this method are forwarded to fit() together with ``config``, so
        # their names and values at this point matter; do not add or rename locals casually.
        # NOTE(review): presumably locals take precedence over ``config`` entries - confirm in
        # merge_locals_kwargs.
        return super().fit(**merge_locals_kwargs(locals(),
                                                 config,
                                                 excludes=('self', 'kwargs', '__class__', 'config')))

    @property
    def _savable_config(self):
        """The parent's savable config with a ``teacher`` entry replaced by its ``load_path``."""
        config = super(DistillableComponent, self)._savable_config
        if 'teacher' in config:
            # distill() stores the loaded teacher component; keep only where it was loaded from.
            config.teacher = config.teacher.load_path
        return config


================================================
FILE: hanlp/components/distillation/losses.py
================================================
# Adopted from https://github.com/airaria/TextBrewer
# Apache License Version 2.0

import torch
import torch.nn.functional as F

from hanlp_common.configurable import AutoConfigurable


def kd_mse_loss(logits_S, logits_T, temperature=1):
    '''
    Mean squared error between the temperature-scaled student and teacher logits.

    :param logits_S: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
    :param logits_T: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
    :param temperature: A float or a tensor of shape (batch_size, length) or (batch_size,)
    '''
    tau = temperature
    if isinstance(tau, torch.Tensor) and tau.dim() > 0:
        # A non-scalar temperature gets a trailing axis so it broadcasts over num_labels.
        tau = tau.unsqueeze(-1)
    return F.mse_loss(logits_S / tau, logits_T / tau)


def kd_ce_loss(logits_S, logits_T, temperature=1):
    '''
    Cross entropy of the student distribution against the teacher distribution, both obtained
    from temperature-scaled logits and averaged over all leading dimensions.

    :param logits_S: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
    :param logits_T: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
    :param temperature: A float or a tensor of shape (batch_size, length) or (batch_size,)
    '''
    tau = temperature
    if isinstance(tau, torch.Tensor) and tau.dim() > 0:
        # A non-scalar temperature gets a trailing axis so it broadcasts over num_labels.
        tau = tau.unsqueeze(-1)
    teacher_probs = F.softmax(logits_T / tau, dim=-1)
    student_log_probs = F.log_softmax(logits_S / tau, dim=-1)
    cross_entropy = (teacher_probs * student_log_probs).sum(dim=-1).mean()
    return -cross_entropy


def att_mse_loss(attention_S, attention_T, mask=None):
    '''
    * Calculates the mse loss between `attention_S` and `attention_T`.
    * Without a `mask`, entries less than or equal to ``-1e-3`` are replaced by zero on both sides first.
    * With a `mask`, only position pairs whose mask values are both non-zero contribute.

    :param torch.Tensor attention_S: tensor of shape  (*batch_size*, *num_heads*, *length*, *length*)
    :param torch.Tensor attention_T: tensor of shape  (*batch_size*, *num_heads*, *length*, *length*)
    :param torch.Tensor mask: tensor of shape  (*batch_size*, *length*)
    '''
    if mask is not None:
        head_mask = mask.to(attention_S).unsqueeze(1).expand(-1, attention_S.size(1), -1)  # (bs, heads, len)
        squared_error = F.mse_loss(attention_S, attention_T, reduction='none')
        weighted = squared_error * head_mask.unsqueeze(-1) * head_mask.unsqueeze(2)
        # Number of valid (row, column) pairs summed over batch and heads.
        valid_count = torch.pow(head_mask.sum(dim=2), 2).sum()
        return weighted.sum() / valid_count

    threshold = -1e-3
    kept_S = torch.where(attention_S <= threshold, torch.zeros_like(attention_S), attention_S)
    kept_T = torch.where(attention_T <= threshold, torch.zeros_like(attention_T), attention_T)
    return F.mse_loss(kept_S, kept_T)


def att_mse_sum_loss(attention_S, attention_T, mask=None):
    '''
    * Calculates the mse loss between `attention_S` and `attention_T`.
    * If the shape is (*batch_size*, *num_heads*, *length*, *length*), sums along the `num_heads` dimension and then calculates the mse loss between the two matrices.
    * Without a `mask`, entries less than or equal to ``-1e-3`` are replaced by zero on both sides first.
    * With a `mask`, only position pairs whose mask values are both non-zero contribute.

    :param torch.Tensor attention_S: tensor of shape  (*batch_size*, *num_heads*, *length*, *length*) or (*batch_size*, *length*, *length*)
    :param torch.Tensor attention_T: tensor of shape  (*batch_size*, *num_heads*, *length*, *length*) or (*batch_size*, *length*, *length*)
    :param torch.Tensor mask:     tensor of shape  (*batch_size*, *length*)
    '''
    if attention_S.dim() == 4:
        # Collapse the head axis on both sides.
        attention_T = attention_T.sum(dim=1)
        attention_S = attention_S.sum(dim=1)

    if mask is not None:
        weights = mask.to(attention_S)
        squared_error = F.mse_loss(attention_S, attention_T, reduction='none')
        weighted = squared_error * weights.unsqueeze(-1) * weights.unsqueeze(1)
        valid_count = torch.pow(weights.sum(dim=1), 2).sum()
        return weighted.sum() / valid_count

    threshold = -1e-3
    kept_S = torch.where(attention_S <= threshold, torch.zeros_like(attention_S), attention_S)
    kept_T = torch.where(attention_T <= threshold, torch.zeros_like(attention_T), attention_T)
    return F.mse_loss(kept_S, kept_T)


def att_ce_loss(attention_S, attention_T, mask=None):
    '''
    * Calculates the cross-entropy loss between `attention_S` and `attention_T`, where softmax is applied on ``dim=-1``.
    * Without a `mask`, teacher probabilities are zeroed wherever the teacher score is less than or equal to ``-1e-3``.
    * With a `mask`, masked key positions and masked query rows are excluded and the sum is divided by the sum of the head-expanded mask.

    :param torch.Tensor attention_S: tensor of shape  (*batch_size*, *num_heads*, *length*, *length*)
    :param torch.Tensor attention_T: tensor of shape  (*batch_size*, *num_heads*, *length*, *length*)
    :param torch.Tensor mask:     tensor of shape  (*batch_size*, *length*)
    '''
    teacher_probs = F.softmax(attention_T, dim=-1)
    student_log_probs = F.log_softmax(attention_S, dim=-1)

    if mask is None:
        kept_probs = torch.where(attention_T <= -1e-3, torch.zeros_like(attention_T), teacher_probs)
        per_row = (kept_probs * student_log_probs).sum(dim=-1)
        return -per_row.mean()

    head_mask = mask.to(attention_S).unsqueeze(1).expand(-1, attention_S.size(1), -1)  # (bs, heads, len)
    per_row = (teacher_probs * student_log_probs * head_mask.unsqueeze(2)).sum(dim=-1)
    return -(per_row * head_mask).sum() / head_mask.sum()


def att_ce_mean_loss(attention_S, attention_T, mask=None):
    '''
    * Calculates the cross-entropy loss between `attention_S` and `attention_T`, where softmax is applied on ``dim=-1``.
    * If the shape is (*batch_size*, *num_heads*, *length*, *length*), averages over dimension `num_heads` and then computes cross-entropy loss between the two matrices.
    * Without a `mask`, teacher probabilities are zeroed wherever the (averaged) teacher score is less than or equal to ``-1e-3``.
    * With a `mask`, masked key positions and masked query rows are excluded and the sum is divided by ``mask.sum()``.

    :param torch.tensor attention_S: tensor of shape  (*batch_size*, *num_heads*, *length*, *length*) or (*batch_size*, *length*, *length*)
    :param torch.tensor attention_T: tensor of shape  (*batch_size*, *num_heads*, *length*, *length*) or (*batch_size*, *length*, *length*)
    :param torch.tensor mask:     tensor of shape  (*batch_size*, *length*)
    '''
    if attention_S.dim() == 4:
        attention_S = attention_S.mean(dim=1)  # (bs, len, len)
        attention_T = attention_T.mean(dim=1)

    teacher_probs = F.softmax(attention_T, dim=-1)
    student_log_probs = F.log_softmax(attention_S, dim=-1)

    if mask is None:
        kept_probs = torch.where(attention_T <= -1e-3, torch.zeros_like(attention_T), teacher_probs)
        per_row = (kept_probs * student_log_probs).sum(dim=-1)
        return -per_row.mean()

    weights = mask.to(attention_S)
    per_row = (teacher_probs * student_log_probs * weights.unsqueeze(1)).sum(dim=-1)
    return -(per_row * weights).sum() / weights.sum()


def hid_mse_loss(state_S, state_T, mask=None):
    '''
    * Calculates the mse loss between `state_S` and `state_T`, which are the hidden states of the models.
    * If `mask` is given, positions are weighted by it and the sum is divided by ``mask.sum() * hidden_size``.
    * If the hidden sizes of student and teacher are different, 'proj' option is required in `intermediate_matches` to match the dimensions.

    :param torch.Tensor state_S: tensor of shape  (*batch_size*, *length*, *hidden_size*)
    :param torch.Tensor state_T: tensor of shape  (*batch_size*, *length*, *hidden_size*)
    :param torch.Tensor mask:    tensor of shape  (*batch_size*, *length*)
    '''
    if mask is None:
        return F.mse_loss(state_S, state_T)

    weights = mask.to(state_S)
    squared_error = F.mse_loss(state_S, state_T, reduction='none')
    masked_total = (squared_error * weights.unsqueeze(-1)).sum()
    valid_count = weights.sum() * state_S.size(-1)
    return masked_total / valid_count


def cos_loss(state_S, state_T, mask=None):
    '''
    * Computes the cosine similarity loss between the inputs. This is the loss used in DistilBERT, see `DistilBERT <https://arxiv.org/abs/1910.01108>`_
    * If `mask` is given, only the positions where ``mask != 0`` contribute.
    * If the hidden sizes of student and teacher are different, 'proj' option is required in `intermediate_matches` to match the dimensions.

    :param torch.Tensor state_S: tensor of shape  (*batch_size*, *length*, *hidden_size*)
    :param torch.Tensor state_T: tensor of shape  (*batch_size*, *length*, *hidden_size*)
    :param torch.Tensor mask:    tensor of shape  (*batch_size*, *length*)
    :return: scalar tensor, the mean of ``1 - cos(state_S, state_T)`` over the selected positions
    '''
    if mask is None:
        state_S = state_S.view(-1, state_S.size(-1))
        state_T = state_T.view(-1, state_T.size(-1))
    else:
        hidden_size = state_S.size(-1)
        # torch.masked_select only accepts a boolean mask, so the mask must NOT be cast to the
        # floating dtype of the states; only move it to their device.
        keep = (mask != 0).to(state_S.device).unsqueeze(-1).expand_as(state_S)  # (bs,len,dim)
        state_S = torch.masked_select(state_S, keep).view(-1, hidden_size)  # (bs * select, dim)
        state_T = torch.masked_select(state_T, keep).view(-1, hidden_size)  # (bs * select, dim)

    # A target of 1 for every row asks cosine_embedding_loss to pull the pairs together.
    target = state_S.new_ones(state_S.size(0))
    loss = F.cosine_embedding_loss(state_S, state_T, target, reduction='mean')
    return loss


def pkd_loss(state_S, state_T, mask=None):
    '''
    * Computes normalized vector mse loss at position 0 along `length` dimension. This is the loss used in BERT-PKD, see `Patient Knowledge Distillation for BERT Model Compression <https://arxiv.org/abs/1908.09355>`_.
    * If the hidden sizes of student and teacher are different, 'proj' option is required in `intermediate_matches` to match the dimensions.

    :param torch.Tensor state_S: tensor of shape  (*batch_size*, *length*, *hidden_size*)
    :param torch.Tensor state_T: tensor of shape  (*batch_size*, *length*, *hidden_size*)
    :param mask: not used.
    '''

    def unit_first_position(state):
        # Vector at position 0, scaled to unit L2 norm: (batch_size, hidden_dim)
        first = state[:, 0]
        return first / torch.norm(first, dim=1, keepdim=True)

    gap = unit_first_position(state_S) - unit_first_position(state_T)
    return gap.pow(2).sum(dim=-1).mean()


def fsp_loss(state_S, state_T, mask=None):
    r'''
    * Takes in two lists of matrices `state_S` and `state_T`. Each list contains two matrices of the shape (*batch_size*, *length*, *hidden_size*). Computes the similarity matrix between the two matrices in `state_S` ( with the resulting shape (*batch_size*, *hidden_size*, *hidden_size*) ) and the ones in `state_T` ( with the resulting shape (*batch_size*, *hidden_size*, *hidden_size*) ), then computes the mse loss between the similarity matrices:

    .. math::

        loss = mean((S_{1}^T \cdot S_{2} - T_{1}^T \cdot T_{2})^2)

    * It is a Variant of FSP loss in `A Gift from Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning <http://openaccess.thecvf.com/content_cvpr_2017/papers/Yim_A_Gift_From_CVPR_2017_paper.pdf>`_.
    * If `mask` is given, the states are multiplied by it and the similarity matrices are divided by the per-sample mask sum instead of the full length.
    * If the hidden sizes of student and teacher are different, 'proj' option is required in `intermediate_matches` to match the dimensions.

    :param torch.tensor state_S: list of two tensors, each tensor is of the shape  (*batch_size*, *length*, *hidden_size*)
    :param torch.tensor state_T: list of two tensors, each tensor is of the shape  (*batch_size*, *length*, *hidden_size*)
    :param torch.tensor mask:    tensor of the shape  (*batch_size*, *length*)

    Example in `intermediate_matches`::

        intermediate_matches = [
        {'layer_T':[0,0], 'layer_S':[0,0], 'feature':'hidden','loss': 'fsp', 'weight' : 1, 'proj':['linear',384,768]},
        ...]
    '''
    first_S, second_S = state_S[0], state_S[1]  # each (batch_size, length, hidden_dim)
    first_T, second_T = state_T[0], state_T[1]

    if mask is None:
        norm_S = second_S.size(1)
        norm_T = second_T.size(1)
    else:
        keep = mask.to(first_S).unsqueeze(-1)  # (batch_size, length, 1)
        first_S, second_S = first_S * keep, second_S * keep
        first_T, second_T = first_T * keep, second_T * keep
        norm_S = norm_T = keep.sum(dim=1, keepdim=True)  # (batch_size, 1, 1)

    gram_S = torch.bmm(first_S.transpose(1, 2), second_S) / norm_S  # (batch_size, hidden_dim, hidden_dim)
    gram_T = torch.bmm(first_T.transpose(1, 2), second_T) / norm_T
    return F.mse_loss(gram_S, gram_T)


def mmd_loss(state_S, state_T, mask=None):
    r'''
    * Takes in two lists of matrices `state_S` and `state_T`. Each list contains 2 matrices of the shape (*batch_size*, *length*, *hidden_size*). `hidden_size` of matrices in `state_S` doesn't need to be the same as that of `state_T`. Computes the similarity matrix between the two matrices in `state_S` ( with the resulting shape (*batch_size*, *length*, *length*) ) and the ones in `state_T` ( with the resulting shape (*batch_size*, *length*, *length*) ), then computes the mse loss between the similarity matrices:

    .. math::

            loss = mean((S_{1} \cdot S_{2}^T - T_{1} \cdot T_{2}^T)^2)

    * It is a Variant of the NST loss in `Like What You Like: Knowledge Distill via Neuron Selectivity Transfer <https://arxiv.org/abs/1707.01219>`_
    * If `mask` is given, only position pairs whose mask values are both non-zero contribute.

    :param torch.tensor state_S: list of two tensors, each tensor is of the shape  (*batch_size*, *length*, *hidden_size*)
    :param torch.tensor state_T: list of two tensors, each tensor is of the shape  (*batch_size*, *length*, *hidden_size*)
    :param torch.tensor mask:    tensor of the shape  (*batch_size*, *length*)

    Example in `intermediate_matches`::

        intermediate_matches = [
        {'layer_T':[0,0], 'layer_S':[0,0], 'feature':'hidden','loss': 'nst', 'weight' : 1},
        ...]
    '''
    first_S, second_S = state_S[0], state_S[1]  # each (batch_size, length, hidden_dim_S)
    first_T, second_T = state_T[0], state_T[1]  # each (batch_size, length, hidden_dim_T)
    sim_S = torch.bmm(first_S, second_S.transpose(1, 2))  # (batch_size, length, length)
    sim_T = torch.bmm(first_T, second_T.transpose(1, 2))

    if mask is None:
        # Normalised by the hidden size of each side.
        return F.mse_loss(sim_S / second_S.size(2), sim_T / second_T.size(2))

    weights = mask.to(state_S[0])
    # NOTE(review): this branch normalises by the length (size(1)) whereas the unmasked branch
    # uses the hidden size (size(2)); kept as-is, confirm whether the difference is intended.
    gram_S = sim_S / second_S.size(1)
    gram_T = sim_T / second_T.size(1)
    squared_error = F.mse_loss(gram_S, gram_T, reduction='none')
    weighted = squared_error * weights.unsqueeze(-1) * weights.unsqueeze(1)
    valid_count = torch.pow(weights.sum(dim=1), 2).sum()
    return weighted.sum() / valid_count


class KnowledgeDistillationLoss(AutoConfigurable):
    """Callable wrapper around one of this module's loss functions, selected by name."""

    def __init__(self, name) -> None:
        """
        Args:
            name: Name of a module-level attribute of this module, e.g. ``'kd_ce_loss'``.
        """
        super().__init__()
        import sys
        self.name = name
        # Look the function up on this very module so that only its name has to be stored.
        self._loss = getattr(sys.modules[__name__], name)

    def __call__(self, *args, **kwargs):
        """Call the wrapped loss function with the given arguments."""
        loss_fn = self._loss
        return loss_fn(*args, **kwargs)


================================================
FILE: hanlp/components/distillation/schedulers.py
================================================
# Adopted from https://github.com/airaria/TextBrewer
# Apache License Version 2.0
from abc import ABC, abstractmethod

import torch

# x is between 0 and 1
from hanlp_common.configurable import AutoConfigurable


def linear_growth_weight_scheduler(x):
    """Return a weight equal to the progress ``x`` (identity mapping)."""
    weight = x
    return weight


def linear_decay_weight_scheduler(x):
    """Return a weight of ``1 - x`` for progress ``x``."""
    remaining = 1 - x
    return remaining


def constant_temperature_scheduler(logits_S, logits_T, base_temperature):
    '''
    Return ``base_temperature`` unchanged; both logits arguments are ignored.
    '''
    temperature = base_temperature
    return temperature


def flsw_temperature_scheduler_builder(beta, gamma, eps=1e-4, *args):
    """Build the "flsw" temperature scheduler (adapted from arXiv:1911.07471).

    Args:
        beta: Scale applied to the per-sample deviation from the mean weight.
        gamma: Exponent applied to ``1 - cosine similarity``.
        eps: Constant added to each vector norm before dividing by it.
        *args: Ignored.

    Returns:
        A function ``(logits_S, logits_T, base_temperature) -> tau`` computed without gradient
        tracking.
    """

    def flsw_temperature_scheduler(logits_S, logits_T, base_temperature):
        with torch.no_grad():
            student = logits_S.detach()
            teacher = logits_T.detach()
            # Scale both logit vectors along the last dim by (L2 norm + eps).
            student_unit = student / (torch.norm(student, dim=-1, keepdim=True) + eps)
            teacher_unit = teacher / (torch.norm(teacher, dim=-1, keepdim=True) + eps)
            similarity = (student_unit * teacher_unit).sum(dim=-1)
            w = torch.pow(1 - similarity, gamma)
            # Samples whose weight is below the mean get a temperature above the base one.
            return base_temperature + (w.mean() - w) * beta

    return flsw_temperature_scheduler


def cwsm_temperature_scheduler_builder(beta, *args):
    """Build the "cwsm" temperature scheduler (adapted from arXiv:1911.07471).

    Args:
        beta: Scale applied to the per-sample deviation from the mean weight.
        *args: Ignored.

    Returns:
        A function ``(logits_S, logits_T, base_temperature) -> tau`` computed without gradient
        tracking. Only the student logits are read; ``logits_T`` is unused.
    """

    def cwsm_temperature_scheduler(logits_S, logits_T, base_temperature):
        with torch.no_grad():
            probabilities = torch.softmax(logits_S.detach(), dim=-1)
            confidence, _ = probabilities.max(dim=-1)
            # Weight is the inverse of the top softmax probability (1e-3 avoids huge values).
            w = 1 / (confidence + 1e-3)
            return base_temperature + (w.mean() - w) * beta

    return cwsm_temperature_scheduler


class LinearTeacherAnnealingScheduler:
    """Step counter whose ``float()`` value is the fraction of training steps taken so far.

    Args:
        num_training_steps: Total number of steps used as the denominator of the fraction.
    """

    def __init__(self, num_training_steps: int) -> None:
        super().__init__()
        self._num_training_steps = num_training_steps
        self._current_training_steps = 0

    def step(self):
        """Record that one more training step has been taken."""
        self._current_training_steps = self._current_training_steps + 1

    def __float__(self):
        """Return ``steps taken / total steps`` (not clamped, so it may exceed 1)."""
        done, total = self._current_training_steps, self._num_training_steps
        return done / total


class TemperatureScheduler(ABC, AutoConfigurable):
    """Base class of callable temperature schedulers.

    Calling an instance with student and teacher logits delegates to :meth:`forward`.

    Args:
        base_temperature: The temperature subclasses start from.
    """

    def __init__(self, base_temperature) -> None:
        super().__init__()
        self.base_temperature = base_temperature

    def __call__(self, logits_S, logits_T):
        """Compute the temperature for the given student and teacher logits."""
        return self.forward(logits_S, logits_T)

    @abstractmethod
    def forward(self, logits_S, logits_T):
        """Return the temperature for the given logits; must be overridden by subclasses."""
        raise NotImplementedError()

    @staticmethod
    def from_name(name, **kwargs):
        """Instantiate a scheduler from its short name.

        Args:
            name: One of ``'constant'``, ``'flsw'`` or ``'cwsm'``.
            **kwargs: Forwarded to the constructor of the selected scheduler. ``'constant'`` has no
                default for ``base_temperature``, so it has to be supplied here, e.g.
                ``from_name('constant', base_temperature=1)``.

        Returns:
            A new scheduler instance.

        Raises:
            AssertionError: If ``name`` is not a supported scheduler name.
        """
        classes = {
            'constant': ConstantScheduler,
            'flsw': FlswScheduler,
            'cwsm': CwsmScheduler,
        }
        assert name in classes, f'Unsupported temperature scheduler {name}. Expect one from {list(classes.keys())}.'
        return classes[name](**kwargs)


class FunctionalScheduler(TemperatureScheduler):
    """Temperature scheduler that delegates to a plain scheduling function.

    Args:
        scheduler_func: Callable invoked as ``scheduler_func(logits_S, logits_T, base_temperature)``.
        base_temperature: Passed as the third argument on every call.
    """

    def __init__(self, scheduler_func, base_temperature) -> None:
        super().__init__(base_temperature)
        self._scheduler_func = scheduler_func

    def forward(self, logits_S, logits_T):
        """Return what the wrapped function computes for these logits and the base temperature."""
        schedule = self._scheduler_func
        return schedule(logits_S, logits_T, self.base_temperature)


class ConstantScheduler(TemperatureScheduler):
    """Temperature scheduler that ignores the logits and always yields ``base_temperature``."""

    def forward(self, logits_S, logits_T):
        """Return the fixed base temperature; neither logits argument is read."""
        temperature = self.base_temperature
        return temperature


class FlswScheduler(FunctionalScheduler):
    """Scheduler backed by the function built by ``flsw_temperature_scheduler_builder``.

    Args:
        beta: Forwarded to the builder.
        gamma: Forwarded to the builder.
        eps: Forwarded to the builder.
        base_temperature: Temperature handed to the built function on every call.
    """

    def __init__(self, beta=1, gamma=1, eps=1e-4, base_temperature=8):
        schedule = flsw_temperature_scheduler_builder(beta, gamma, eps)
        super().__init__(schedule, base_temperature)
        # Keep the hyper-parameters on the instance as public attributes.
        self.beta, self.gamma, self.eps = beta, gamma, eps


class CwsmScheduler(FunctionalScheduler):
    """Scheduler backed by the function built by ``cwsm_temperature_scheduler_builder``.

    Args:
        beta: Forwarded to the builder.
        base_temperature: Temperature handed to the built function on every call.
    """

    def __init__(self, beta=1, base_temperature=8):
        schedule = cwsm_temperature_scheduler_builder(beta)
        super().__init__(schedule, base_temperature)
        # Keep the hyper-parameter on the instance as a public attribute.
        self.beta = beta


================================================
FILE: hanlp/components/eos/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-26 20:19

================================================
FILE: hanlp/components/eos/ngram.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-26 20:19
import logging
from collections import Counter
from typing import Union, List, Callable

import torch
from torch import nn, optim
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader

from hanlp.common.dataset import PadSequenceDataLoader
from hanlp.common.torch_component import TorchComponent
from hanlp.common.vocab import Vocab
from hanlp.datasets.eos.eos import SentenceBoundaryDetectionDataset
from hanlp.metrics.f1 import F1
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs


class NgramSentenceBoundaryDetectionModel(nn.Module):
    """Character embedding + RNN encoder + linear layer producing one logit per input window.

    Args:
        char_vocab_size: Number of rows of the character embedding table.
        embedding_size: Dimension of the character embeddings.
        rnn_type: ``'LSTM'`` or ``'GRU'`` (case-insensitive).
        rnn_size: Hidden size of the RNN.
        rnn_layers: Number of stacked RNN layers.
        rnn_bidirectional: Whether the RNN is bidirectional.
        dropout: Dropout probability. It is applied to the encoded window before the linear layer
            and between RNN layers when ``rnn_layers > 1``. A falsy value disables dropout.
        **kwargs: Ignored, so that a whole config can be splatted into the constructor.

    Raises:
        NotImplementedError: If ``rnn_type`` is neither LSTM nor GRU.
    """

    def __init__(self,
                 char_vocab_size,
                 embedding_size=128,
                 rnn_type: str = 'LSTM',
                 rnn_size=256,
                 rnn_layers=1,
                 rnn_bidirectional=False,
                 dropout=0.2,
                 **kwargs
                 ):
        super(NgramSentenceBoundaryDetectionModel, self).__init__()
        self.embed = nn.Embedding(num_embeddings=char_vocab_size,
                                  embedding_dim=embedding_size)
        rnn_type = rnn_type.lower()
        rnn_classes = {'lstm': nn.LSTM, 'gru': nn.GRU}
        if rnn_type not in rnn_classes:
            raise NotImplementedError(f"'{rnn_type}' has to be one of [LSTM, GRU]")
        # Use the float ``dropout`` argument here: ``self.dropout`` is a module (or None) that is
        # only created below. Inter-layer dropout is meaningless for a single layer, hence 0.0.
        self.rnn = rnn_classes[rnn_type](input_size=embedding_size,
                                         hidden_size=rnn_size,
                                         num_layers=rnn_layers,
                                         dropout=(dropout or 0.0) if rnn_layers > 1 else 0.0,
                                         bidirectional=rnn_bidirectional,
                                         batch_first=True)
        self.dropout = nn.Dropout(p=dropout) if dropout else None
        self.dense = nn.Linear(in_features=rnn_size * (2 if rnn_bidirectional else 1),
                               out_features=1)

    def forward(self, x: torch.Tensor):
        """Score each window.

        Args:
            x: Integer tensor of character ids, indexed as ``(batch, length)`` since the RNN is
                built with ``batch_first=True``.

        Returns:
            A tensor of shape ``(batch,)`` holding one logit per window.
        """
        output = self.embed(x)
        self.rnn.flatten_parameters()
        output, _ = self.rnn(output)
        # Always summarize a window by its last time step, whether or not dropout is enabled.
        output = output[:, -1, :]
        if self.dropout is not None:
            output = self.dropout(output)
        return self.dense(output).squeeze(-1)


class NgramSentenceBoundaryDetector(TorchComponent):

    def __init__(self, **kwargs) -> None:
        """A sentence boundary detector using ngram as features and LSTM as encoder (:cite:`Schweter:Ahmed:2019`).
        It predicts whether a punctuation marks an ``EOS``.

        .. Note::
            This component won't work on text without the punctuations defined in its config. It's always
            recommended to understand how it works before using it. The predefined punctuations can be listed by the
            following codes.

            >>> print(eos.config.eos_chars)

        Args:
            **kwargs: Passed to config.
        """
        super().__init__(**kwargs)

    def build_optimizer(self, **kwargs):
        """Create an Adam optimizer over the model parameters with learning rate ``config.lr``."""
        optimizer = optim.Adam(self.model.parameters(), lr=self.config.lr)
        return optimizer

    def build_criterion(self, **kwargs):
        """Return the training criterion, a ``BCEWithLogitsLoss`` with default settings."""
        return BCEWithLogitsLoss()

    def build_metric(self, **kwargs):
        """Return a fresh ``F1`` metric."""
        return F1()

    def execute_training_loop(self,
                              trn: DataLoader,
                              dev: DataLoader,
                              epochs,
                              criterion,
                              optimizer,
                              metric,
                              save_dir,
                              logger: logging.Logger,
                              devices,
                              **kwargs):
        """Train for ``epochs`` epochs and save the weights whenever the score improves.

        Args:
            trn: Training dataloader.
            dev: Development dataloader; evaluation is skipped when it is falsy, in which case the
                score compared below is the one left in ``metric`` by training.
            epochs: Number of epochs.
            criterion: Loss function.
            optimizer: Optimizer.
            metric: Metric object exposing ``score``.
            save_dir: Where the weights are saved.
            logger: Logger for progress messages.
            devices: Unused here.
            **kwargs: Unused here.
        """
        best_metric = -1
        timer = CountdownTimer(epochs)
        ratio_width = len(f'{len(trn)}/{len(trn)}')
        for epoch in range(1, epochs + 1):
            logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
            self.fit_dataloader(trn, criterion, optimizer, metric, logger)
            if dev:
                self.evaluate_dataloader(dev, criterion, metric, logger, ratio_width=ratio_width)
            report = f'{timer.elapsed_human}/{timer.total_time_human}'
            dev_score = metric.score
            if dev_score > best_metric:
                self.save_weights(save_dir)
                best_metric = dev_score
                report += ' [red]saved[/red]'
            timer.log(report, ratio_percentage=False, newline=True, ratio=False)

    def fit_dataloader(self,
                       trn: DataLoader,
                       criterion,
                       optimizer,
                       metric,
                       logger: logging.Logger,
                       **kwargs):
        """Run one training epoch.

        Args:
            trn: Training dataloader.
            criterion: Loss function.
            optimizer: Optimizer stepped once per batch.
            metric: Metric reset at the start and updated on every batch.
            logger: Logger for progress messages.
            **kwargs: Unused here.

        Returns:
            The summed batch losses divided by ``timer.total``.
        """
        self.model.train()
        timer = CountdownTimer(len(trn))
        total_loss = 0
        self.reset_metrics(metric)
        for batch in trn:
            optimizer.zero_grad()
            prediction = self.feed_batch(batch)
            loss = self.compute_loss(prediction, batch, criterion)
            self.update_metrics(batch, prediction, metric)
            loss.backward()
            # Gradient clipping is optional and only applied when ``config.grad_norm`` is truthy.
            if self.config.grad_norm:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm)
            optimizer.step()
            total_loss += loss.item()
            timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
                      logger=logger)
            del loss
        return total_loss / timer.total

    def compute_loss(self, prediction, batch, criterion):
        """Apply ``criterion`` to the predicted logits and ``batch['label_id']``."""
        loss = criterion(prediction, batch['label_id'])
        return loss

    # noinspection PyMethodOverriding
    def evaluate_dataloader(self,
                            data: DataLoader,
                            criterion: Callable,
                            metric,
                            logger,
                            ratio_width=None,
                            output=False,
                            **kwargs):
        """Evaluate the model on a dataloader.

        Args:
            data: Dataloader to evaluate on.
            criterion: Loss function.
            metric: Metric reset at the start and updated on every batch.
            logger: Logger for progress messages.
            ratio_width: Forwarded to the progress timer.
            output: Unused here.
            **kwargs: Unused here.

        Returns:
            A tuple of the summed batch losses divided by ``timer.total``, and ``metric``.
        """
        self.model.eval()
        self.reset_metrics(metric)
        timer = CountdownTimer(len(data))
        total_loss = 0
        for batch in data:
            prediction = self.feed_batch(batch)
            self.update_metrics(batch, prediction, metric)
            loss = self.compute_loss(prediction, batch, criterion)
            total_loss += loss.item()
            timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
                      logger=logger,
                      ratio_width=ratio_width)
            del loss
        return total_loss / timer.total, metric

    def build_model(self, training=True, **kwargs) -> torch.nn.Module:
        """Build the model from the config, sizing its embedding table by the char vocab."""
        model = NgramSentenceBoundaryDetectionModel(**self.config, char_vocab_size=len(self.vocabs.char))
        return model

    def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger, **kwargs) -> DataLoader:
        """Wrap ``data`` into a padded dataloader, building the vocabs first if they are empty.

        Args:
            data: A file path or in-memory samples; when it is a ``str`` the dataset cache is purged.
            batch_size: Number of samples per batch.
            shuffle: Whether to shuffle samples.
            device: Device the batches are placed on.
            logger: Logger used when the vocabs get built.
            **kwargs: Unused here.

        Returns:
            A ``PadSequenceDataLoader`` that pads ``label_id`` with ``0.0``.
        """
        dataset = SentenceBoundaryDetectionDataset(data, **self.config, transform=[self.vocabs])
        if isinstance(data, str):
            dataset.purge_cache()
        if not self.vocabs:
            self.build_vocabs(dataset, logger)
        return PadSequenceDataLoader(dataset, batch_size=batch_size, shuffle=shuffle, device=device,
                                     pad={'label_id': .0})

    def predict(self, data: Union[str, List[str]], batch_size: int = None, strip=True, **kwargs):
        """Sentence split.

        Args:
            data: A paragraph or a list of paragraphs.
            batch_size: Number of samples per batch. Falls back to the batch size stored in the config when
                not specified.
            strip: Strip out blank characters at the head and tail of each sentence.

        Returns:
            A list of sentences or a list of lists of sentences.
        """
        if not data:
            return []
        self.model.eval()
        flat = isinstance(data, str)
        if flat:
            data = [data]
        samples = []
        eos_chars = self.config.eos_chars
        window_size = self.config.window_size
        # Every candidate EOS character becomes one sample: a window of characters around it.
        for doc_id_, corpus in enumerate(data):
            corpus = list(corpus)
            for i, c in enumerate(corpus):
                if c in eos_chars:
                    window = corpus[max(0, i - window_size): i + window_size + 1]
                    samples.append({'char': window, 'offset_': i, 'doc_id_': doc_id_})
        eos_prediction = [[] for _ in range(len(data))]
        if samples:
            # Let an explicitly requested batch size override the configured one; splatting the raw
            # config would silently ignore the ``batch_size`` argument.
            config = dict(self.config)
            if batch_size:
                config['batch_size'] = batch_size
            dataloader = self.build_dataloader(samples, **config, device=self.device, shuffle=False, logger=None)
            # Inference needs no autograd graph.
            with torch.no_grad():
                for batch in dataloader:
                    logits = self.feed_batch(batch)
                    prediction = (logits > 0).tolist()
                    for doc_id_, offset_, eos in zip(batch['doc_id_'], batch['offset_'], prediction):
                        if eos:
                            eos_prediction[doc_id_].append(offset_)
        outputs = []
        for corpus, output in zip(data, eos_prediction):
            sents_per_document = []
            prev_offset = 0
            for offset in output:
                # The EOS character itself belongs to the sentence it terminates.
                offset += 1
                sents_per_document.append(corpus[prev_offset:offset])
                prev_offset = offset
            if prev_offset != len(corpus):
                sents_per_document.append(corpus[prev_offset:])
            if strip:
                sents_per_document = [x.strip() for x in sents_per_document]
            sents_per_document = [x for x in sents_per_document if x]
            outputs.append(sents_per_document)
        if flat:
            outputs = outputs[0]
        return outputs

    # noinspection PyMethodOverriding
    def fit(self,
            trn_data,
            dev_data,
            save_dir,
            epochs=5,
            append_after_sentence=None,
            eos_chars=None,
            eos_char_min_freq=200,
            eos_char_is_punct=True,
            char_min_freq=None,
            window_size=5,
            batch_size=32,
            lr=0.001,
            grad_norm=None,
            loss_reduction='sum',
            embedding_size=128,
            rnn_type: str = 'LSTM',
            rnn_size=256,
            rnn_layers=1,
            rnn_bidirectional=False,
            dropout=0.2,
            devices=None,
            logger=None,
            seed=None,
            **kwargs
            ):
        """Train the detector; every argument is forwarded to the parent ``fit``.

        Only the arguments whose use is visible in this class are described here.

        Args:
            trn_data: Training data.
            dev_data: Development data.
            save_dir: Directory the weights are saved into.
            epochs: Number of training epochs.
            char_min_freq: When truthy, only characters occurring at least this often enter the vocab.
            window_size: Number of characters taken on each side of a candidate EOS character.
            batch_size: Number of samples per batch.
            lr: Learning rate of the Adam optimizer.
            grad_norm: When truthy, gradients are clipped to this norm.
            embedding_size: Dimension of the character embeddings.
            rnn_type: ``'LSTM'`` or ``'GRU'``.
            rnn_size: Hidden size of the RNN.
            rnn_layers: Number of RNN layers.
            rnn_bidirectional: Whether the RNN is bidirectional.
            dropout: Dropout probability.
            **kwargs: Merged with the named arguments and forwarded as well.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def build_vocabs(self, dataset: SentenceBoundaryDetectionDataset, logger, **kwargs):
        """Build and lock the character vocab, and copy the dataset's EOS characters to the config.

        Args:
            dataset: Dataset whose samples provide the ``char`` field.
            logger: Logger the vocab summary is written to.
            **kwargs: Unused here.
        """
        char_min_freq = self.config.char_min_freq
        if char_min_freq:
            has_cache = dataset.cache is not None
            char_counter = Counter()
            for each in dataset:
                for c in each['char']:
                    char_counter[c] += 1
            self.vocabs.char = vocab = Vocab()
            for c, f in char_counter.items():
                if f >= char_min_freq:
                    vocab.add(c)
            if has_cache:
                # Iterate once more after purging so the samples get processed again.
                # NOTE(review): presumably this re-caches them with the new vocab; confirm.
                dataset.purge_cache()
                for each in dataset:
                    pass
        else:
            self.vocabs.char = Vocab()
            # NOTE(review): presumably one pass lets the vocab transform collect characters; confirm.
            for each in dataset:
                pass
        self.config.eos_chars = dataset.eos_chars
        self.vocabs.lock()
        self.vocabs.summary(logger)

    def reset_metrics(self, metrics):
        """Reset ``metrics`` in place."""
        metrics.reset()

    def report_metrics(self, loss, metrics):
        """Format the loss (4 decimals) followed by the metrics for progress reporting."""
        return f'loss: {loss:.4f} {metrics}'

    def update_metrics(self, batch: dict, prediction: torch.FloatTensor, metrics):
        """Update ``metrics`` with the positions predicted positive versus those labelled positive.

        Args:
            batch: Batch holding the gold ``label_id`` tensor.
            prediction: Logits; a position counts as positive when its logit is greater than 0.
            metrics: Callable metric taking ``(predicted, gold)`` sets of positions.
        """
        def nonzero_offsets(y):
            return set(y.nonzero().squeeze(-1).tolist())

        metrics(nonzero_offsets(prediction > 0), nonzero_offsets(batch['label_id']))

    def feed_batch(self, batch):
        """Run the model on ``batch['char_id']`` and return its logits."""
        prediction = self.model(batch['char_id'])
        return prediction


================================================
FILE: hanlp/components/lambda_wrapper.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 18:36
from typing import Callable, Any

from hanlp.common.component import Component
from hanlp_common.reflection import classpath_of, object_from_classpath, str_to_type


class LambdaComponent(Component):
    """Component that wraps a plain function so it can be used wherever a component is expected.

    Args:
        function: The callable invoked by :meth:`predict`. Its classpath is recorded in the config
            so the component can be rebuilt by :meth:`from_config`.
    """

    def __init__(self, function: Callable) -> None:
        super().__init__()
        self.config = {}
        self.function = function
        for key, target in (('function', function), ('classpath', self)):
            self.config[key] = classpath_of(target)

    def predict(self, data: Any, **kwargs):
        """Apply the wrapped function to ``data``.

        Args:
            data: The input. When ``_hanlp_unpack`` is truthy it is unpacked into positional
                arguments, otherwise it is passed as a single argument.
            **kwargs: Forwarded to the function, except for ``_hanlp_unpack`` which is consumed here.

        Returns:
            Whatever the wrapped function returns.
        """
        if kwargs.pop('_hanlp_unpack', None):
            return self.function(*data, **kwargs)
        return self.function(data, **kwargs)

    @staticmethod
    def from_config(meta: dict, **kwargs):
        """Rebuild a component from the ``classpath`` and ``function`` entries of ``meta``."""
        component_cls = str_to_type(meta['classpath'])
        return component_cls(object_from_classpath(meta['function']))


================================================
FILE: hanlp/components/lemmatizer.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-08 18:35
from typing import List

from hanlp.common.transform import TransformList
from hanlp.components.parsers.ud.lemma_edit import gen_lemma_rule, apply_lemma_rule
from hanlp.components.taggers.transformers.transformer_tagger import TransformerTagger


def add_lemma_rules_to_sample(sample: dict):
    """Replace the lemmas stored under ``tag`` with lemma rules, in place.

    Nothing happens when the sample has no ``tag`` field or already has a ``lemma`` field.
    Otherwise each ``(token, tag)`` pair is turned into a rule by ``gen_lemma_rule``, except that
    the placeholder ``"_"`` is kept as is. The resulting list is stored under both ``lemma`` and
    ``tag``.

    Args:
        sample: A sample dict with a ``token`` field and, optionally, ``tag`` and ``lemma`` fields.

    Returns:
        The same ``sample`` object.
    """
    if 'tag' not in sample or 'lemma' in sample:
        return sample
    lemma_rules = []
    for word, lemma in zip(sample['token'], sample['tag']):
        lemma_rules.append("_" if lemma == "_" else gen_lemma_rule(word, lemma))
    sample['lemma'] = sample['tag'] = lemma_rules
    return sample


class TransformerLemmatizer(TransformerTagger):

    def __init__(self, **kwargs) -> None:
        """A transition based lemmatizer using transformer as encoder.

        Args:
            **kwargs: Predefined config.
        """
        super().__init__(**kwargs)

    def build_dataset(self, data, transform=None, **kwargs):
        """Build the dataset with ``add_lemma_rules_to_sample`` appended to its transforms.

        Args:
            data: The data handed to the parent ``build_dataset``.
            transform: ``None``, a list of transforms, or a single transform. A list is extended in
                place; a single transform is kept as the first element of a new ``TransformList``
                instead of being dropped.
            **kwargs: Forwarded to the parent ``build_dataset``.

        Returns:
            Whatever the parent ``build_dataset`` returns.
        """
        if not isinstance(transform, list):
            single_transform = transform
            transform = TransformList()
            if single_transform is not None:
                transform.append(single_transform)
        transform.append(add_lemma_rules_to_sample)
        return super().build_dataset(data, transform, **kwargs)

    def prediction_to_human(self, pred, vocab: List[str], batch, token=None):
        """Turn predicted lemma rules into lemmas.

        Args:
            pred: Predictions handed to the parent ``prediction_to_human``.
            vocab: Vocabulary handed to the parent ``prediction_to_human``.
            batch: The batch; its ``token`` field is used when ``token`` is not given.
            token: Optional tokens per sentence to apply the rules to.

        Yields:
            For each sentence, the list of lemmas obtained by applying each rule to its token.
        """
        if token is None:
            token = batch['token']
        rules = super().prediction_to_human(pred, vocab, batch)
        for token_per_sent, rule_per_sent in zip(token, rules):
            lemma_per_sent = [apply_lemma_rule(t, r) for t, r in zip(token_per_sent, rule_per_sent)]
            yield lemma_per_sent


================================================
FILE: hanlp/components/lm/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-29 21:07


================================================
FILE: hanlp/components/lm/mlm.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-29 21:07
import logging
import math
from typing import Callable, Union, List

import torch
from hanlp_common.constant import IDX
from hanlp_common.util import reorder
from torch.utils.data import DataLoader
from transformers import AutoModelForMaskedLM
from transformers.tokenization_utils import PreTrainedTokenizer

from hanlp.common.dataset import TransformableDataset, PadSequenceDataLoader, SortingSampler
from hanlp.common.torch_component import TorchComponent
from hanlp.layers.transformers.pt_imports import AutoTokenizer_
from hanlp.transform.transformer_tokenizer import TransformerTextTokenizer
from hanlp.utils.time_util import CountdownTimer


class MaskedLanguageModelDataset(TransformableDataset):
    """Dataset for masked language model inputs that does not support loading from a file."""

    def load_file(self, filepath: str):
        """Always raises ``NotImplementedError``: loading from ``filepath`` is unsupported."""
        raise NotImplementedError()


class MaskedLanguageModel(TorchComponent):
    """Prediction-only component that fills mask tokens using a masked language model.

    All training related hooks raise ``NotImplementedError``.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        # Assigned by ``load_vocabs``.
        self.tokenizer: PreTrainedTokenizer = None

    def build_dataloader(self, data, batch_size, shuffle=False, device=None, logger: logging.Logger = None,
                         verbose=False, **kwargs) -> DataLoader:
        """Tokenize the sentences and batch them with a length-sorting sampler.

        Args:
            data: An iterable of sentences; each one is stored under the ``token`` field.
            batch_size: Handed to the ``SortingSampler``.
            shuffle: Unused here.
            device: Device the batches are placed on.
            logger: Unused here.
            verbose: When truthy, progress of the preprocessing pass is logged.
            **kwargs: Unused here.

        Returns:
            A ``PadSequenceDataLoader`` over the tokenized sentences.
        """
        dataset = MaskedLanguageModelDataset([{'token': x} for x in data], generate_idx=True,
                                             transform=TransformerTextTokenizer(self.tokenizer, text_a_key='token'))
        if verbose:
            verbose = CountdownTimer(len(dataset))
        lens = []
        # One pass over the dataset collects the subtoken lengths used for sorting.
        for each in dataset:
            lens.append(len(each['token_input_ids']))
            if verbose:
                verbose.log('Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]')
        dataloader = PadSequenceDataLoader(dataset, batch_sampler=SortingSampler(lens, batch_size=batch_size),
                                           device=device)
        return dataloader

    def build_optimizer(self, **kwargs):
        """Not supported: this component cannot be trained."""
        raise NotImplementedError()

    def build_criterion(self, **kwargs):
        """Not supported: this component cannot be trained."""
        raise NotImplementedError()

    def build_metric(self, **kwargs):
        """Not supported: this component cannot be trained."""
        raise NotImplementedError()

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, **kwargs):
        """Not supported: this component cannot be trained."""
        raise NotImplementedError()

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        """Not supported: this component cannot be trained."""
        raise NotImplementedError()

    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        """Not supported: this component cannot be evaluated."""
        raise NotImplementedError()

    def build_model(self, training=True, transformer=None, **kwargs) -> torch.nn.Module:
        """Load a pretrained masked language model identified by ``transformer``."""
        return AutoModelForMaskedLM.from_pretrained(transformer)

    def input_is_flat(self, masked_sents):
        """Return ``True`` when the input is a single sentence (a ``str``)."""
        return isinstance(masked_sents, str)

    def predict(self, masked_sents: Union[str, List[str]], batch_size=32, topk=10, **kwargs):
        """Predict the most probable tokens for every mask token.

        Args:
            masked_sents: A sentence or a list of sentences containing the tokenizer's mask token.
            batch_size: Number of sentences per batch.
            topk: Number of candidates returned per mask.
            **kwargs: Unused here.

        Returns:
            For each sentence, a list with one dict per mask token mapping each candidate token to
            its probability. Sentences without a mask get an empty list. A single list is returned
            for a single input sentence.
        """
        flat = self.input_is_flat(masked_sents)
        if flat:
            masked_sents = [masked_sents]
        dataloader = self.build_dataloader(masked_sents, **self.config, device=self.device, batch_size=batch_size)
        orders = []
        results = []
        for batch in dataloader:
            input_ids = batch['token_input_ids']
            outputs = self.model(input_ids=input_ids, attention_mask=batch['token_attention_mask'])
            mask = input_ids == self.tokenizer.mask_token_id
            if mask.any():
                num_masks = mask.sum(dim=-1).tolist()
                masked_logits = outputs.logits[mask]
                # Special tokens must never be proposed as fillers.
                masked_logits[:, self.tokenizer.all_special_ids] = -math.inf
                probs, indices = torch.nn.functional.softmax(masked_logits, dim=-1).topk(topk)
                br = []
                for p, index in zip(probs.tolist(), indices.tolist()):
                    br.append(dict(zip(self.tokenizer.convert_ids_to_tokens(index), p)))
                # ``br`` is flat over all masks of the batch; cut it back into per-sentence chunks.
                offset = 0
                for n in num_masks:
                    results.append(br[offset:offset + n])
                    offset += n
            else:
                # A distinct empty list per sentence, so results never alias each other.
                results.extend([] for _ in range(input_ids.size(0)))
            orders.extend(batch[IDX])
        # Batches are sorted by length; restore the input order.
        results = reorder(results, orders)
        if flat:
            results = results[0]
        return results

    def load_config(self, save_dir, filename='config.json', **kwargs):
        """Use ``save_dir`` as the transformer identifier instead of reading a config file."""
        self.config.transformer = save_dir

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Load the tokenizer of ``config.transformer``; no vocab file is read."""
        self.tokenizer = AutoTokenizer_.from_pretrained(self.config.transformer)

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """Do nothing: the weights are loaded by ``build_model`` from the pretrained model."""
        pass


================================================
FILE: hanlp/components/mtl/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-20 19:54

================================================
FILE: hanlp/components/mtl/multi_task_learning.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-20 19:55
import functools
import itertools
import logging
import os
from collections import defaultdict
from copy import copy
from itertools import chain
from typing import Union, List, Callable, Dict, Optional, Any, Iterable, Tuple

import numpy as np
import torch
from hanlp_common.constant import IDX, BOS, EOS
from hanlp_common.document import Document
from hanlp_common.util import merge_locals_kwargs, topological_sort, reorder, prefix_match
from hanlp_common.visualization import markdown_table
from toposort import toposort
from torch.utils.data import DataLoader

import hanlp.utils.torch_util
from hanlp.common.dataset import PadSequenceDataLoader, PrefetchDataLoader, CachedDataLoader
from hanlp.common.structure import History
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import FieldLength, TransformList
from hanlp.components.mtl.tasks import Task
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding, ContextualWordEmbeddingModule
from hanlp.layers.embeddings.embedding import Embedding
from hanlp.layers.transformers.utils import pick_tensor_for_each_token
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp.utils.time_util import CountdownTimer
from hanlp.utils.torch_util import clip_grad_norm


class MultiTaskModel(torch.nn.Module):
    """Container module grouping a shared encoder with per-task scalar mixes and decoders.

    It defines no ``forward``; it only registers the submodules.

    Args:
        encoder: The shared encoder.
        scalar_mixes: Scalar mix modules stored as a ``ModuleDict``.
        decoders: Decoder modules stored as a ``ModuleDict``.
        use_raw_hidden_states: A dict stored as is.
    """

    def __init__(self,
                 encoder: torch.nn.Module,
                 scalar_mixes: torch.nn.ModuleDict,
                 decoders: torch.nn.ModuleDict,
                 use_raw_hidden_states: dict) -> None:
        super().__init__()
        self.use_raw_hidden_states = use_raw_hidden_states
        # Submodules are registered in a fixed order: encoder, scalar mixes, decoders.
        self.encoder: ContextualWordEmbeddingModule = encoder
        self.scalar_mixes, self.decoders = scalar_mixes, decoders


class MultiTaskDataLoader(DataLoader):
    """Dataloader that interleaves the batches of several per-task dataloaders.

    Iterating yields ``(task_name, batch)`` pairs. In training mode the task of each batch is
    sampled at random with probability proportional to ``len(dataloader) ** tau``. Otherwise the
    dataloaders are consumed one after another.

    Args:
        training: Whether to sample tasks at random (``True``) or to iterate them sequentially.
        tau: Exponent applied to the dataloader sizes when computing the sampling weights.
        **dataloaders: The per-task dataloaders keyed by task name.
    """

    def __init__(self, training=True, tau: float = 0.8, **dataloaders) -> None:
        # noinspection PyTypeChecker
        super().__init__(None)
        self.tau = tau
        self.training = training
        self.dataloaders: Dict[str, DataLoader] = dataloaders if dataloaders else {}

    def __len__(self) -> int:
        """Return the total number of batches over all tasks (0 without dataloaders)."""
        return sum(len(x) for x in self.dataloaders.values())

    @staticmethod
    def _endless(dataloader):
        """Yield batches forever by starting a new pass over ``dataloader`` whenever it runs out.

        Unlike ``itertools.cycle`` this keeps no copy of the batches it has yielded. The generator
        ends if a pass yields nothing, so an empty dataloader cannot make it spin forever.
        """
        while True:
            empty_pass = True
            for batch in dataloader:
                empty_pass = False
                yield batch
            if empty_pass:
                return

    def __iter__(self):
        """Yield ``(task_name, batch)`` pairs; ``len(self)`` of them per pass."""
        if self.training:
            sampling_weights, total_size = self.sampling_weights
            task_names = list(self.dataloaders.keys())
            iterators = dict((k, self._endless(v)) for k, v in self.dataloaders.items())
            for _ in range(total_size):
                task_name = np.random.choice(task_names, p=sampling_weights)
                yield task_name, next(iterators[task_name])
        else:
            for task_name, dataloader in self.dataloaders.items():
                for batch in dataloader:
                    yield task_name, batch

    @property
    def sampling_weights(self):
        """Return ``(weights, total_size)``.

        ``weights`` holds one probability per task, proportional to its size raised to ``tau``, and
        ``total_size`` is the sum of the sizes.
        """
        sampling_weights = self.sizes
        total_size = sum(sampling_weights)
        Z = sum(pow(v, self.tau) for v in sampling_weights)
        sampling_weights = [pow(v, self.tau) / Z for v in sampling_weights]
        return sampling_weights, total_size

    @property
    def sizes(self):
        """Return the number of batches of each dataloader, in insertion order."""
        return [len(v) for v in self.dataloaders.values()]


class MultiTaskLearning(TorchComponent):

    def __init__(self, **kwargs) -> None:
        """A multi-task learning (MTL) framework in which several decoders share one encoder. Decoders may
        depend on each other, and such dependencies are taken care of during decoding. A component joins this
        framework by implementing the :class:`~hanlp.components.mtl.tasks.Task` interface.

        The architecture mostly follows :cite:`clark-etal-2019-bam` and :cite:`he-choi-2021-stem`, plus the
        scalar mix trick (:cite:`kondratyuk-straka-2019-75`) which lets each task attend to any subset of layers.
        Knowledge distillation on single tasks was also tried, but the gain was nonsignificant on a large
        dataset. Since most datasets HanLP uses are relatively large and our hardware is relatively powerful,
        there is no plan to invest more efforts in distillation in the near future.

        Args:
            **kwargs: Arguments passed to config.
        """
        super().__init__(**kwargs)
        # Nothing is available yet at construction time; ``tasks`` is filled in by ``on_config_ready``.
        self.vocabs = None
        self.tasks: Dict[str, Task] = None
        self.model: Optional[MultiTaskModel] = None

    def build_dataloader(self,
                         data,
                         batch_size,
                         shuffle=False,
                         device=None,
                         logger: logging.Logger = None,
                         gradient_accumulation=1,
                         tau: float = 0.8,
                         prune=None,
                         prefetch=None,
                         tasks_need_custom_eval=None,
                         cache=False,
                         debug=False,
                         **kwargs) -> DataLoader:
        """Build one dataloader per task and wrap them all into a :class:`MultiTaskDataLoader`.

        Args:
            data: ``'trn'``, ``'dev'`` or ``'tst'`` to select the corresponding split of every task; any other
                value is handed to each task as is.
            batch_size: Not read by this method.
            shuffle: Used as the ``training`` flag of the :class:`MultiTaskDataLoader`.
            device: Forwarded to each task's ``build_dataloader``.
            logger: Receives progress messages. It is used unconditionally whenever ``data`` is a string, so it
                must not be ``None`` in that case.
            gradient_accumulation: Forwarded to each task's ``build_dataloader``.
            tau: Forwarded to :class:`MultiTaskDataLoader`.
            prune: Not read by this method (the pruning code below is commented out).
            prefetch: If truthy, the result is wrapped into a ``PrefetchDataLoader``, but only for ``'trn'`` or
                when ``tasks_need_custom_eval`` is falsy.
            tasks_need_custom_eval: See ``prefetch``.
            cache: If truthy, ``'trn'`` and ``'dev'`` dataloaders are wrapped into a ``CachedDataLoader``; a string
                value is used as the directory prefix of the cache file name.
            debug: If truthy, the dev split is used in place of the training split.
            **kwargs: Not read by this method.

        Returns:
            The multi-task dataloader, possibly wrapped into a ``PrefetchDataLoader``.
        """
        # This method is only called during training or evaluation but not prediction
        dataloader = MultiTaskDataLoader(training=shuffle, tau=tau)
        for i, (task_name, task) in enumerate(self.tasks.items()):
            encoder_transform, transform = self.build_transform(task)
            # ``training`` stays None when ``data`` is not one of the known split names
            training = None
            if data == 'trn':
                if debug:
                    _data = task.dev
                else:
                    _data = task.trn
                training = True
            elif data == 'dev':
                _data = task.dev
                training = False
            elif data == 'tst':
                _data = task.tst
                training = False
            else:
                _data = data
            if isinstance(data, str):
                logger.info(f'[yellow]{i + 1} / {len(self.tasks)}[/yellow] Building [blue]{data}[/blue] dataset for '
                            f'[cyan]{task_name}[/cyan] ...')
            # Adjust Tokenizer according to task config
            # A shallow copy, so that popping 'transform' does not alter the task's own config
            config = copy(task.config)
            config.pop('transform', None)
            task_dataloader: DataLoader = task.build_dataloader(_data, transform, training, device, logger,
                                                                tokenizer=encoder_transform.tokenizer,
                                                                gradient_accumulation=gradient_accumulation,
                                                                cache=isinstance(data, str), **config)
            # if prune:
            #     # noinspection PyTypeChecker
            #     task_dataset: TransformDataset = task_dataloader.dataset
            #     size_before = len(task_dataset)
            #     task_dataset.prune(prune)
            #     size_after = len(task_dataset)
            #     num_pruned = size_before - size_after
            #     logger.info(f'Pruned [yellow]{num_pruned} ({num_pruned / size_before:.1%})[/yellow] '
            #                 f'samples out of {size_before}.')
            if cache and data in ('trn', 'dev'):
                # The cache file name embeds the pid, the split and the task name ('/' replaced by '-')
                task_dataloader: CachedDataLoader = CachedDataLoader(
                    task_dataloader,
                    f'{cache}/{os.getpid()}-{data}-{task_name.replace("/", "-")}-cache.pt' if isinstance(cache,
                                                                                                         str) else None
                )
            dataloader.dataloaders[task_name] = task_dataloader
        if data == 'trn':
            # Log a table showing how tau-scaled sampling redistributes the batches among tasks
            sampling_weights, total_size = dataloader.sampling_weights
            headings = ['task', '#batches', '%batches', '#scaled', '%scaled', '#epoch']
            matrix = []
            min_epochs = []
            for (task_name, dataset), weight in zip(dataloader.dataloaders.items(), sampling_weights):
                # Ratio of the task's own number of batches to the number it is expected to be sampled
                epochs = len(dataset) / weight / total_size
                matrix.append(
                    [f'{task_name}', len(dataset), f'{len(dataset) / total_size:.2%}', int(total_size * weight),
                     f'{weight:.2%}', f'{epochs:.2f}'])
                min_epochs.append(epochs)
            longest = int(torch.argmax(torch.tensor(min_epochs)))
            table = markdown_table(headings, matrix)
            rows = table.splitlines()
            # Skip 2 rows (presumably the heading and its separator) and highlight the largest '#epoch' cell
            cells = rows[longest + 2].split('|')
            cells[-2] = cells[-2].replace(f'{min_epochs[longest]:.2f}',
                                          f'[bold][red]{min_epochs[longest]:.2f}[/red][/bold]')
            rows[longest + 2] = '|'.join(cells)
            logger.info(f'[bold][yellow]{"Samples Distribution": ^{len(rows[0])}}[/yellow][/bold]')
            logger.info('\n'.join(rows))
        if prefetch and (data == 'trn' or not tasks_need_custom_eval):
            dataloader = PrefetchDataLoader(dataloader, prefetch=prefetch)

        return dataloader

    def build_transform(self, task: Task) -> Tuple[TransformerSequenceTokenizer, TransformList]:
        """Build the transform pipeline of a task.

        Args:
            task: The task whose tokenizer is to be built on top of the encoder's transform.

        Returns:
            The tokenizer transform of the task together with the whole pipeline, which consists of the optional
            ``transform`` from config, the tokenizer transform and a token length transform, in that order.
        """
        contextual_embedding: ContextualWordEmbedding = self.config.encoder
        tokenizer_transform: TransformerSequenceTokenizer = task.build_tokenizer(contextual_embedding.transform())
        pipeline = TransformList(tokenizer_transform, FieldLength('token', 'token_length'))
        user_transform = self.config.get('transform', None)
        if user_transform:
            # The user supplied transform always runs first
            pipeline.insert(0, user_transform)
        return tokenizer_transform, pipeline

    def build_optimizer(self,
                        trn,
                        epochs,
                        adam_epsilon,
                        weight_decay,
                        warmup_steps,
                        lr,
                        encoder_lr,
                        **kwargs):
        """Build the shared optimizer, its scheduler and the separate optimizers requested by tasks.

        Args:
            trn: The training dataloader; only its length is used.
            epochs: Number of epochs, used to compute the number of training steps.
            adam_epsilon: ``eps`` of AdamW.
            weight_decay: Weight decay applied to every parameter not matched by the no-decay list.
            warmup_steps: Multiplied by the number of training steps to obtain the warmup steps of the scheduler.
            lr: Learning rate of decoders whose task defines no ``lr`` of its own.
            encoder_lr: Learning rate of the encoder.
            **kwargs: Forwarded to ``build_optimizer`` of tasks which ask for a separate optimizer.

        Returns:
            A tuple of the shared optimizer, its linear warmup scheduler and a dict mapping task names to whatever
            their ``build_optimizer`` returned.
        """
        model = self.model_
        encoder = model.encoder
        num_training_steps = len(trn) * epochs // self.config.get('gradient_accumulation', 1)
        encoder_parameters = list(encoder.parameters())
        parameter_groups: List[Dict[str, Any]] = []

        decoders = model.decoders
        decoder_optimizers = dict()
        for k, task in self.tasks.items():
            decoder: torch.nn.Module = decoders[k]
            decoder_parameters = list(decoder.parameters())
            if task.separate_optimizer:
                # This decoder is excluded from the shared optimizer
                decoder_optimizers[k] = task.build_optimizer(decoder=decoder, **kwargs)
            else:
                task_lr = task.lr or lr
                parameter_groups.append({"params": decoder_parameters, 'lr': task_lr})
        parameter_groups.append({"params": encoder_parameters, 'lr': encoder_lr})
        # Parameters whose name contains any of these substrings are exempt from weight decay
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        no_decay_parameters = set()
        for n, p in model.named_parameters():
            if any(nd in n for nd in no_decay):
                no_decay_parameters.add(p)
        # Split every group into a decayed part (kept in place) and a non-decayed part (collected per lr)
        no_decay_by_lr = defaultdict(list)
        for group in parameter_groups:
            _lr = group['lr']
            ps = group['params']
            group['params'] = decay_parameters = []
            group['weight_decay'] = weight_decay
            for p in ps:
                if p in no_decay_parameters:
                    no_decay_by_lr[_lr].append(p)
                else:
                    decay_parameters.append(p)
        # One extra group without weight decay for each distinct learning rate
        for _lr, ps in no_decay_by_lr.items():
            parameter_groups.append({"params": ps, 'lr': _lr, 'weight_decay': 0.0})
        # noinspection PyTypeChecker
        from transformers import optimization
        encoder_optimizer = optimization.AdamW(
            parameter_groups,
            lr=lr,
            weight_decay=weight_decay,
            eps=adam_epsilon,
        )
        encoder_scheduler = optimization.get_linear_schedule_with_warmup(encoder_optimizer,
                                                                         num_training_steps * warmup_steps,
                                                                         num_training_steps)
        return encoder_optimizer, encoder_scheduler, decoder_optimizers

    def build_criterion(self, **kwargs):
        """Build the criterion of every task.

        Args:
            **kwargs: Forwarded to ``build_criterion`` of each task.

        Returns:
            A dict mapping each task name to the criterion built by that task for its decoder.
        """
        criteria = {}
        for task_name, task in self.tasks.items():
            criteria[task_name] = task.build_criterion(decoder=self.model_.decoders[task_name], **kwargs)
        return criteria

    def build_metric(self, **kwargs):
        """Build the metric of every task.

        Args:
            **kwargs: Forwarded to ``build_metric`` of each task.

        Returns:
            A ``MetricDict`` mapping each task name to its metric.
        """
        task_metrics = MetricDict()
        for task_name, task in self.tasks.items():
            built = task.build_metric(**kwargs)
            assert built, f'Please implement `build_metric` of {type(task)} to return a metric.'
            task_metrics[task_name] = built
        return task_metrics

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, patience=0.5, **kwargs):
        """Run the training loop with early stopping and restore the best weights afterwards.

        Args:
            trn: The training dataloader.
            dev: The dev dataloader; evaluation is skipped when it is falsy.
            epochs: The maximum number of epochs.
            criterion: Passed to ``fit_dataloader`` and ``evaluate_dataloader``.
            optimizer: Passed to ``fit_dataloader``.
            metric: The metric which is updated by training/evaluation and whose ``score`` drives model selection.
            save_dir: Where the best weights are saved to and restored from.
            logger: The logger.
            devices: Not read by this method.
            patience: Number of epochs without improvement to tolerate. A float is interpreted as a ratio of
                ``epochs``.
            **kwargs: Not read by this method.

        Returns:
            The best score observed.
        """
        if isinstance(patience, float):
            patience = int(patience * epochs)
        best_epoch, best_metric = 0, -1
        timer = CountdownTimer(epochs)
        ratio_width = len(f'{len(trn)}/{len(trn)}')
        epoch = 0
        history = History()
        try:
            for epoch in range(1, epochs + 1):
                logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
                self.fit_dataloader(trn, criterion, optimizer, metric, logger, history, ratio_width=ratio_width,
                                    **self.config)
                if dev:
                    self.evaluate_dataloader(dev, criterion, metric, logger, ratio_width=ratio_width, input='dev')
                report = f'{timer.elapsed_human}/{timer.total_time_human}'
                dev_score = metric.score
                early_stop = False
                if dev_score > best_metric:
                    self.save_weights(save_dir)
                    best_metric = dev_score
                    best_epoch = epoch
                    report += ' [red]saved[/red]'
                else:
                    report += f' ({epoch - best_epoch})'
                    if epoch - best_epoch >= patience:
                        report += ' early stop'
                        early_stop = True
                # Log before leaving the loop, otherwise the report of the early-stopped epoch is lost
                timer.log(report, ratio_percentage=False, newline=True, ratio=False)
                if early_stop:
                    break
        finally:
            # Release the dataloaders even if an epoch failed
            for d in [trn, dev]:
                self._close_dataloader(d)
        if best_epoch != epoch:
            logger.info(f'Restoring best model saved [red]{epoch - best_epoch}[/red] epochs ago')
            self.load_weights(save_dir)
        return best_metric

    def _close_dataloader(self, d):
        """Close a dataloader and, recursively, the dataloaders it wraps.

        Args:
            d: The dataloader to close. Objects which are none of the handled types are ignored.
        """
        if isinstance(d, PrefetchDataLoader):
            d.close()
            # What a prefetcher wraps may need closing as well
            if hasattr(d.dataset, 'close'):
                self._close_dataloader(d.dataset)
            return
        if isinstance(d, CachedDataLoader):
            d.close()
            return
        if isinstance(d, MultiTaskDataLoader):
            for task_dataloader in d.dataloaders.values():
                self._close_dataloader(task_dataloader)

    # noinspection PyMethodOverriding
    def fit_dataloader(self,
                       trn: DataLoader,
                       criterion,
                       optimizer,
                       metric,
                       logger: logging.Logger,
                       history: History,
                       ratio_width=None,
                       gradient_accumulation=1,
                       encoder_grad_norm=None,
                       decoder_grad_norm=None,
                       patience=0.5,
                       eval_trn=False,
                       **kwargs):
        """Train the model for one pass over ``trn``.

        Args:
            trn: A dataloader yielding ``(task_name, batch)`` pairs.
            criterion: A dict mapping task names to their criteria.
            optimizer: A tuple of (shared optimizer, its scheduler, dict of separate optimizers keyed by task
                name). A separate optimizer may itself be an ``(optimizer, scheduler)`` tuple.
            metric: The metrics, reset at the beginning and updated only if ``eval_trn`` is set.
            logger: The logger used for progress reports.
            history: Decides, through ``history.step``, on which batches an optimization step is taken.
            ratio_width: Forwarded to the progress timer.
            gradient_accumulation: Number of batches to accumulate gradients over; the loss is divided by it.
            encoder_grad_norm: If set, the max gradient norm of encoder parameters.
            decoder_grad_norm: If set, the max gradient norm of decoder parameters.
            patience: Not read by this method.
            eval_trn: Whether to decode and score the training batches.
            **kwargs: Not read by this method.

        Returns:
            The average loss per batch.
        """
        self.model.train()
        encoder_optimizer, encoder_scheduler, decoder_optimizers = optimizer
        timer = CountdownTimer(len(trn))
        total_loss = 0
        self.reset_metrics(metric)
        model = self.model_
        # ``Module.parameters()`` returns a one-shot iterator. Materialize it, otherwise gradient clipping
        # would see the parameters on the first optimization step only and nothing on every later step.
        encoder_parameters = list(model.encoder.parameters())
        decoder_parameters = list(model.decoders.parameters())
        for idx, (task_name, batch) in enumerate(trn):
            decoder_optimizer = decoder_optimizers.get(task_name, None)
            output_dict, _ = self.feed_batch(batch, task_name)
            loss = self.compute_loss(batch, output_dict[task_name]['output'], criterion[task_name],
                                     self.tasks[task_name])
            if gradient_accumulation and gradient_accumulation > 1:
                loss /= gradient_accumulation
            loss.backward()
            total_loss += float(loss.item())
            if history.step(gradient_accumulation):
                if self.config.get('grad_norm', None):
                    clip_grad_norm(model, self.config.grad_norm)
                if encoder_grad_norm:
                    torch.nn.utils.clip_grad_norm_(encoder_parameters, encoder_grad_norm)
                if decoder_grad_norm:
                    torch.nn.utils.clip_grad_norm_(decoder_parameters, decoder_grad_norm)
                encoder_optimizer.step()
                encoder_optimizer.zero_grad()
                encoder_scheduler.step()
                # Only the separate optimizer of the current batch's task (if any) is stepped here
                if decoder_optimizer:
                    if isinstance(decoder_optimizer, tuple):
                        decoder_optimizer, decoder_scheduler = decoder_optimizer
                    else:
                        decoder_scheduler = None
                    decoder_optimizer.step()
                    decoder_optimizer.zero_grad()
                    if decoder_scheduler:
                        decoder_scheduler.step()
            if eval_trn:
                self.decode_output(output_dict, batch, task_name)
                self.update_metrics(batch, output_dict, metric, task_name)
            timer.log(self.report_metrics(total_loss / (timer.current + 1), metric if eval_trn else None),
                      ratio_percentage=None,
                      ratio_width=ratio_width,
                      logger=logger)
            # Drop the references early so that the tensors can be freed before the next batch
            del loss
            del output_dict
        return total_loss / timer.total

    def report_metrics(self, loss, metrics: MetricDict):
        """Format the loss, followed by the metrics if there are any, for progress reports.

        Args:
            loss: The loss to report, shown with 4 decimal places.
            metrics: The metrics to append; skipped when falsy.

        Returns:
            The formatted report.
        """
        summary = f'loss: {loss:.4f}'
        if metrics:
            summary = f'{summary} {metrics.cstr()}'
        return summary

    # noinspection PyMethodOverriding
    @torch.no_grad()
    def evaluate_dataloader(self,
                            data: MultiTaskDataLoader,
                            criterion,
                            metric: MetricDict,
                            logger,
                            ratio_width=None,
                            input: str = None,
                            **kwargs):
        """Evaluate the model on a multi-task dataloader.

        Tasks listed in the ``tasks_need_custom_eval`` config are taken out of ``data`` for the generic loop,
        evaluated by their own ``evaluate_dataloader`` afterwards, and then put back into ``data``.

        Args:
            data: A dataloader yielding ``(task_name, batch)`` pairs.
            criterion: A dict mapping task names to their criteria.
            metric: The metrics, which are reset first and then updated.
            logger: The logger used for progress reports.
            ratio_width: Forwarded to the progress timer.
            input: Name of the split being evaluated. ``'dev'`` selects ``task.dev`` for custom evaluation,
                anything else selects ``task.tst``.
            **kwargs: Not read by this method.

        Returns:
            A tuple of the average loss, the metrics and ``data``.
        """
        self.model.eval()
        self.reset_metrics(metric)
        tasks_need_custom_eval = self.config.get('tasks_need_custom_eval', None)
        tasks_need_custom_eval = tasks_need_custom_eval or {}
        tasks_need_custom_eval = dict((k, None) for k in tasks_need_custom_eval)
        # Temporarily remove these dataloaders so that the generic loop below does not iterate them
        for each in tasks_need_custom_eval:
            tasks_need_custom_eval[each] = data.dataloaders.pop(each)
        # Each custom evaluation counts as one step of the timer
        timer = CountdownTimer(len(data) + len(tasks_need_custom_eval))
        total_loss = 0
        for idx, (task_name, batch) in enumerate(data):
            output_dict, _ = self.feed_batch(batch, task_name)
            loss = self.compute_loss(batch, output_dict[task_name]['output'], criterion[task_name],
                                     self.tasks[task_name])
            total_loss += loss.item()
            self.decode_output(output_dict, batch, task_name)
            self.update_metrics(batch, output_dict, metric, task_name)
            timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
                      logger=logger,
                      ratio_width=ratio_width)
            del loss
            del output_dict

        for task_name, dataset in tasks_need_custom_eval.items():
            task = self.tasks[task_name]
            decoder = self.model_.decoders[task_name]
            task.evaluate_dataloader(
                dataset, task.build_criterion(decoder=decoder),
                metric=metric[task_name],
                input=task.dev if input == 'dev' else task.tst,
                split=input,
                decoder=decoder,
                # The task obtains hidden states of a batch through this callback
                h=functools.partial(self._encode, task_name=task_name,
                                    cls_is_bos=task.cls_is_bos, sep_is_eos=task.sep_is_eos)
            )
            # Put the dataloader back (it ends up last in the dict)
            data.dataloaders[task_name] = dataset
            timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
                      logger=logger,
                      ratio_width=ratio_width)

        return total_loss / timer.total, metric, data

    def build_model(self, training=False, **kwargs) -> torch.nn.Module:
        """Build the shared encoder and the decoder (plus optional scalar mix) of every task.

        Args:
            training: Forwarded to the encoder and to ``build_model`` of each task.
            **kwargs: Not read by this method.

        Returns:
            A ``MultiTaskModel`` holding the encoder, scalar mixes and decoders.
        """
        tasks = self.tasks
        embedding: ContextualWordEmbedding = self.config.encoder
        shared_encoder = embedding.module(training=training)
        hidden_size = shared_encoder.get_output_dim()
        mixes, heads = torch.nn.ModuleDict(), torch.nn.ModuleDict()
        raw_flags = dict()
        for name, task in tasks.items():
            head = task.build_model(hidden_size, training=training, **task.config)
            assert head, f'Please implement `build_model` of {type(task)} to return a decoder.'
            heads[name] = head
            if task.scalar_mix:
                mixes[name] = task.scalar_mix.build()
                # Activate scalar mix starting from 0-th layer
                embedding.scalar_mix = 0
            raw_flags[name] = task.use_raw_hidden_states
        # Raw hidden states are requested from the encoder as soon as one task wants them
        embedding.ret_raw_hidden_states = any(raw_flags.values())
        return MultiTaskModel(shared_encoder, mixes, heads, raw_flags)

    def predict(self,
                data: Union[str, List[str]],
                tasks: Optional[Union[str, List[str]]] = None,
                skip_tasks: Optional[Union[str, List[str]]] = None,
                resolved_tasks=None,
                **kwargs) -> Document:
        """Predict on data.

        The first task of the first group builds the samples and the dataloader. For every batch, it is run
        first and the remaining tasks are run group by group on top of its outputs.

        Args:
            data: A sentence or a list of sentences.
            tasks: The tasks to predict.
            skip_tasks: The tasks to skip.
            resolved_tasks: The resolved tasks to override ``tasks`` and ``skip_tasks``.
            **kwargs: Not used.

        Returns:
            A :class:`~hanlp_common.document.Document`.
        """
        doc = Document()
        target_tasks = resolved_tasks or self.resolve_tasks(tasks, skip_tasks)
        # Nothing to predict: return an empty list for every requested task
        if data == []:
            for group in target_tasks:
                for task_name in group:
                    doc[task_name] = []
            return doc
        flatten_target_tasks = [self.tasks[t] for group in target_tasks for t in group]
        # The special tokens are kept as soon as one of the target tasks needs them
        cls_is_bos = any([x.cls_is_bos for x in flatten_target_tasks])
        sep_is_eos = any([x.sep_is_eos for x in flatten_target_tasks])
        # Now build the dataloaders and execute tasks
        first_task_name: str = list(target_tasks[0])[0]
        first_task: Task = self.tasks[first_task_name]
        encoder_transform, transform = self.build_transform(first_task)
        # Override the tokenizer config of the 1st task
        encoder_transform.sep_is_eos = sep_is_eos
        encoder_transform.cls_is_bos = cls_is_bos
        average_subwords = self.model.encoder.average_subwords
        # A flat input is a single sample; wrap it into a batch of one and unwrap the results at the end
        flat = first_task.input_is_flat(data)
        if flat:
            data = [data]
        device = self.device
        samples = first_task.build_samples(data, cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
        dataloader = first_task.build_dataloader(samples, transform=transform, device=device)
        results = defaultdict(list)
        # Sample indices in the order they were batched, used to restore the input order afterwards
        order = []
        for batch in dataloader:
            order.extend(batch[IDX])
            # Run the first task, let it make the initial batch for the successors
            output_dict = self.predict_task(first_task, first_task_name, batch, results, run_transform=True,
                                            cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
            # Run each task group in order
            for group_id, group in enumerate(target_tasks):
                # We could parallelize this in the future
                for task_name in group:
                    if task_name == first_task_name:
                        continue
                    output_dict = self.predict_task(self.tasks[task_name], task_name, batch, results, output_dict,
                                                    run_transform=True, cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
                if group_id == 0:
                    # We are kind of hard coding here. If the first task is a tokenizer,
                    # we need to convert the hidden and mask to token level
                    if first_task_name.startswith('tok'):
                        spans = []
                        tokens = []
                        output_spans = first_task.config.get('output_spans', None)
                        # Pair the predicted spans with the results of this batch (the last ones appended)
                        for span_per_sent, token_per_sent in zip(output_dict[first_task_name]['prediction'],
                                                                 results[first_task_name][-len(batch[IDX]):]):
                            if output_spans:
                                # Keep only the first element of each result item
                                # (presumably the token text - confirm against the tokenizer task)
                                token_per_sent = [x[0] for x in token_per_sent]
                            if cls_is_bos:
                                span_per_sent = [(-1, 0)] + span_per_sent
                                token_per_sent = [BOS] + token_per_sent
                            if sep_is_eos:
                                span_per_sent = span_per_sent + [(span_per_sent[-1][0] + 1, span_per_sent[-1][1] + 1)]
                                token_per_sent = token_per_sent + [EOS]
                            # The offsets start with 0 while [CLS] is zero
                            if average_subwords:
                                span_per_sent = [list(range(x[0] + 1, x[1] + 1)) for x in span_per_sent]
                            else:
                                span_per_sent = [x[0] + 1 for x in span_per_sent]
                            spans.append(span_per_sent)
                            tokens.append(token_per_sent)
                        spans = PadSequenceDataLoader.pad_data(spans, 0, torch.long, device=device)
                        output_dict['hidden'] = pick_tensor_for_each_token(output_dict['hidden'], spans,
                                                                           average_subwords)
                        batch['token_token_span'] = spans
                        batch['token'] = tokens
                        # noinspection PyTypeChecker
                        batch['token_length'] = torch.tensor([len(x) for x in tokens], dtype=torch.long, device=device)
                        # Remove the old mask; ``feed_batch`` rebuilds it from ``token_length``
                        batch.pop('mask', None)
        # Put results into doc in the order of tasks
        for k in self.config.task_names:
            v = results.get(k, None)
            if v is None:
                continue
            doc[k] = reorder(v, order)
        # Allow task to perform finalization on document
        for group in target_tasks:
            for task_name in group:
                task = self.tasks[task_name]
                task.finalize_document(doc, task_name)
        # If no tok in doc, use raw input as tok
        if not any(k.startswith('tok') for k in doc):
            doc['tok'] = data
        if flat:
            for k, v in list(doc.items()):
                doc[k] = v[0]
        # If there is only one field, don't bother to wrap it
        # if len(doc) == 1:
        #     return list(doc.values())[0]
        return doc

    def resolve_tasks(self, tasks, skip_tasks) -> List[Iterable[str]]:
        """Decide which tasks to perform and in which order.

        Args:
            tasks: The name(s) of the requested tasks. When it resolves to nothing, all tasks are performed.
            skip_tasks: The name(s) of the tasks to leave out.

        Returns:
            Groups of task names in topological order. Within a group, names follow the order of
            ``config.task_names``.
        """
        all_levels = self._tasks_in_topological_order
        level_of = self._task_topological_order
        graph = self._computation_graph
        requested = self._resolve_task_name(tasks)
        if requested:
            # Gather every requested task together with its dependencies, bucketed by topological level
            by_level = defaultdict(set)
            for task_name in requested:
                for dependency in topological_sort(graph, task_name):
                    by_level[level_of[dependency]].add(dependency)
            groups = [by_level[level] for level in sorted(by_level)]
        else:
            groups = all_levels
        if skip_tasks:
            skip_tasks = self._resolve_task_name(skip_tasks)
            # Groups left empty by skipping are dropped
            groups = [remaining for remaining in (group - skip_tasks for group in groups) if remaining]
        assert groups, f'No task to perform due to `tasks = {tasks}`.'
        # Sort target tasks within the same group in a defined order
        task_names = self.config.task_names
        return [sorted(group, key=task_names.index) for group in groups]

    def predict_task(self, task: Task, output_key, batch, results, output_dict=None, run_transform=True,
                     cls_is_bos=True, sep_is_eos=True):
        """Run one task on a batch and append its results.

        Args:
            task: The task to run.
            output_key: The name of the task, under which its outputs and results are stored.
            batch: The batch to predict on.
            results: A mapping from task names to the lists of results collected so far.
            output_dict: Outputs of the tasks which have already been run on this batch, if any.
            run_transform: Forwarded to ``feed_batch``.
            cls_is_bos: Forwarded to ``feed_batch``.
            sep_is_eos: Forwarded to ``feed_batch``.

        Returns:
            The output dict, now also containing the outputs of this task.
        """
        output_dict, batch = self.feed_batch(batch, output_key, output_dict, run_transform, cls_is_bos, sep_is_eos,
                                             results)
        self.decode_output(output_dict, batch, output_key)
        collected = results[output_key]
        prediction = output_dict[output_key]['prediction']
        collected.extend(task.prediction_to_result(prediction, batch))
        return output_dict

    def _resolve_task_name(self, dependencies):
        """Resolve task name(s), which may be abbreviated, into a set of full task names.

        Args:
            dependencies: A task name, a pattern ending with ``*`` which matches every task starting with the
                part before it, a prefix of a task name, or an iterable of those. Anything else (e.g. ``None``)
                resolves to an empty set.

        Returns:
            The set of resolved task names.
        """
        resolved = set()
        if isinstance(dependencies, str):
            if dependencies in self.tasks:
                resolved.add(dependencies)
            elif dependencies.endswith('*'):
                stem = dependencies[:-1]
                resolved.update(name for name in self.tasks if name.startswith(stem))
            else:
                prefix_matched = prefix_match(dependencies, self.config.task_names)
                assert prefix_matched, f'No prefix matching for {dependencies}. ' \
                                       f'Check your dependencies definition: {list(self.tasks.values())}'
                resolved.add(prefix_matched)
            return resolved
        if isinstance(dependencies, Iterable):
            # Resolve every element on its own and merge the results
            for each in dependencies:
                resolved.update(self._resolve_task_name(each))
        return resolved

    def fit(self,
            encoder: Embedding,
            tasks: Dict[str, Task],
            save_dir,
            epochs,
            patience=0.5,
            lr=1e-3,
            encoder_lr=5e-5,
            adam_epsilon=1e-8,
            weight_decay=0.0,
            warmup_steps=0.1,
            gradient_accumulation=1,
            grad_norm=5.0,
            encoder_grad_norm=None,
            decoder_grad_norm=None,
            tau: float = 0.8,
            transform=None,
            # prune: Callable = None,
            eval_trn=True,
            prefetch=None,
            tasks_need_custom_eval=None,
            _device_placeholder=False,
            cache=False,
            devices=None,
            logger=None,
            seed=None,
            **kwargs):
        """Train this component on several tasks.

        Every argument, together with the locals defined in the body, is forwarded to ``fit`` of the super
        class; ``tasks`` is expanded so that each task becomes a keyword argument named after it.

        Args:
            encoder: The encoder shared by all tasks.
            tasks: The tasks keyed by their names.
            save_dir: The directory to save the component into.
            epochs: The maximum number of epochs.
            patience: See ``execute_training_loop``.
            lr: See ``build_optimizer``.
            encoder_lr: See ``build_optimizer``.
            adam_epsilon: See ``build_optimizer``.
            weight_decay: See ``build_optimizer``.
            warmup_steps: See ``build_optimizer``.
            gradient_accumulation: See ``fit_dataloader``.
            grad_norm: Read from config by ``fit_dataloader`` to clip gradients of the whole model.
            encoder_grad_norm: See ``fit_dataloader``.
            decoder_grad_norm: See ``fit_dataloader``.
            tau: See ``build_dataloader``.
            transform: Read from config by ``build_transform`` as an extra transform to run first.
            eval_trn: See ``fit_dataloader``.
            prefetch: See ``build_dataloader``.
            tasks_need_custom_eval: See ``build_dataloader`` and ``evaluate_dataloader``.
            _device_placeholder: Forwarded to the super class.
            cache: See ``build_dataloader``.
            devices: Forwarded to the super class.
            logger: Forwarded to the super class.
            seed: Forwarded to the super class.
            **kwargs: Merged into the forwarded arguments.

        Returns:
            Whatever ``fit`` of the super class returns.
        """
        # NOTE: everything below is collected through ``locals()``, so these assignments are arguments in
        # disguise; adding, renaming or removing a local variable changes what is forwarded.
        trn_data, dev_data, batch_size = 'trn', 'dev', None
        task_names = list(tasks.keys())
        return super().fit(**merge_locals_kwargs(locals(), kwargs, excludes=('self', 'kwargs', '__class__', 'tasks')),
                           **tasks)

    # noinspection PyAttributeOutsideInit
    def on_config_ready(self, **kwargs):
        """Collect the tasks from config and work out the order in which they have to be executed.

        Args:
            **kwargs: Not read by this method.
        """
        # Every config value which is a Task is a task of this component, named after its key
        self.tasks = {name: value for name, value in self.config.items() if isinstance(value, Task)}
        graph = {name: self._resolve_task_name(task.dependencies) for name, task in self.tasks.items()}

        # We can cache this order
        layers = list(toposort(graph))
        level_of = {name: level for level, layer in enumerate(layers) for name in layer}
        self._tasks_in_topological_order = layers
        self._task_topological_order = level_of
        self._computation_graph = graph

    @staticmethod
    def reset_metrics(metrics: Dict[str, Metric]):
        """Reset every metric in place.

        Args:
            metrics: The metrics keyed by task name.
        """
        for each in metrics.values():
            each.reset()

    def feed_batch(self,
                   batch: Dict[str, Any],
                   task_name,
                   output_dict=None,
                   run_transform=False,
                   cls_is_bos=False,
                   sep_is_eos=False,
                   results=None) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """Encode a batch (unless already encoded) and feed it to the decoder of a task.

        Args:
            batch: The batch to feed. Its ``mask`` is overwritten with a mask built from ``token_length``.
            task_name: The name of the task to run.
            output_dict: Outputs of previous tasks on this batch; its hidden states are reused when given.
            run_transform: Whether to let the task transform the batch before decoding.
            cls_is_bos: Forwarded to ``_encode`` and to the batch transform of the task.
            sep_is_eos: Forwarded to ``_encode`` and to the batch transform of the task.
            results: Results of previous tasks, forwarded to the batch transform of the task.

        Returns:
            The output dict, with the ``output`` and ``mask`` of this task stored under its name, and the
            (possibly transformed) batch.
        """
        h, output_dict = self._encode(batch, task_name, output_dict, cls_is_bos, sep_is_eos)
        task = self.tasks[task_name]
        if run_transform:
            batch = task.transform_batch(batch, results=results, cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
        mask = hanlp.utils.torch_util.lengths_to_mask(batch['token_length'])
        batch['mask'] = mask
        decoder = self.model.decoders[task_name]
        task_output = task.feed_batch(h, batch=batch, mask=mask, decoder=decoder)
        output_dict[task_name] = {'output': task_output, 'mask': mask}
        return output_dict, batch

    def _encode(self, batch, task_name, output_dict=None, cls_is_bos=False, sep_is_eos=False):
        """Get the hidden states a task decodes from.

        Args:
            batch: The batch to encode. It is only passed to the encoder when ``output_dict`` is falsy.
            task_name: The name of the task.
            output_dict: Outputs of previous tasks holding ``hidden`` and ``raw_hidden`` to be reused.
            cls_is_bos: Whether the hidden states include a leading special token.
            sep_is_eos: Whether the hidden states include a trailing special token.

        Returns:
            The hidden states for the task and the output dict holding ``hidden`` and ``raw_hidden``.
        """
        model = self.model
        if output_dict:
            hidden = output_dict['hidden']
            raw_hidden = output_dict['raw_hidden']
        else:
            encoded = model.encoder(batch)
            # The encoder returns either the hidden states alone or a pair of (hidden, raw hidden)
            hidden, raw_hidden = encoded if isinstance(encoded, tuple) else (encoded, None)
            output_dict = {'hidden': hidden, 'raw_hidden': raw_hidden}
        source = raw_hidden if model.use_raw_hidden_states[task_name] else hidden
        if task_name in model.scalar_mixes:
            h = model.scalar_mixes[task_name](source)
        elif model.scalar_mixes:
            # If any task enables scalar_mix, hidden_states will be a 4d tensor
            h = source[-1, :, :, :]
        else:
            h = source
        # If the task doesn't need cls while h has cls, remove cls
        task = self.tasks[task_name]
        if cls_is_bos and not task.cls_is_bos:
            h = h[:, 1:, :]
        # Likewise for the trailing special token
        if sep_is_eos and not task.sep_is_eos:
            h = h[:, :-1, :]
        return h, output_dict

    def decode_output(self, output_dict, batch, task_name=None):
        """Decode the outputs of tasks into predictions, stored under the ``prediction`` key of each task.

        Args:
            output_dict: The outputs of tasks keyed by task name.
            batch: The batch which produced those outputs.
            task_name: The task to decode. When falsy, every task which has outputs in ``output_dict`` is decoded.
        """

        def _decode(name, task, per_task):
            per_task['prediction'] = task.decode_output(
                per_task['output'],
                per_task['mask'],
                batch,
                self.model.decoders[name])

        if task_name:
            per_task = output_dict[task_name]
            _decode(task_name, self.tasks[task_name], per_task)
            return
        for name, task in self.tasks.items():
            per_task = output_dict.get(name, None)
            if per_task is not None:
                _decode(name, task, per_task)

    def update_metrics(self, batch: Dict[str, Any], output_dict: Dict[str, Any], metrics: MetricDict, task_name):
        """Update the metric of a task with its outputs and predictions on a batch.

        Args:
            batch: The batch which produced the outputs.
            output_dict: The outputs of tasks keyed by task name. Nothing happens if the task has none.
            metrics: The metrics keyed by task name.
            task_name: The name of the task to update.
        """
        task = self.tasks[task_name]
        per_task = output_dict.get(task_name, None)
        if not per_task:
            return
        task.update_metrics(batch, per_task['output'], per_task['prediction'], metrics.get(task_name, None))

    def compute_loss(self,
                     batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                     criterion: Callable,
                     task: Task) -> torch.FloatTensor:
        """Compute the loss of a task by delegating to the task itself.

        Args:
            batch: The batch which produced the output.
            output: The output of the task's decoder.
            criterion: The criterion of the task.
            task: The task.

        Returns:
            The loss computed by the task.
        """
        loss = task.compute_loss(batch, output, criterion)
        return loss

    def evaluate(self, save_dir=None, logger: logging.Logger = None, batch_size=None, output=False, **kwargs):
        """Evaluate on the test set of every task and close the test dataloader afterwards.

        Args:
            save_dir: Forwarded to the super class.
            logger: Forwarded to the super class.
            batch_size: Forwarded to the super class.
            output: Forwarded to the super class.
            **kwargs: Forwarded to the super class.

        Returns:
            Whatever the super class returns; its last element is treated as the test dataloader.
        """
        results = super().evaluate('tst', save_dir, logger, batch_size, output, **kwargs)
        self._close_dataloader(results[-1])
        return results

    def save_vocabs(self, save_dir, filename='vocabs.json'):
        """Let every task save its vocabs into a file prefixed with the task name.

        Args:
            save_dir: The directory to save into.
            filename: The common suffix of the file names.
        """
        for task_name in self.tasks:
            self.tasks[task_name].save_vocabs(save_dir, f'{task_name}_{filename}')

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Let every task load its vocabs from a file prefixed with the task name.

        Args:
            save_dir: The directory to load from.
            filename: The common suffix of the file names.
        """
        for task_name in self.tasks:
            self.tasks[task_name].load_vocabs(save_dir, f'{task_name}_{filename}')

    def parallelize(self, devices: List[Union[int, torch.device]]):
        """Not supported by this component.

        Args:
            devices: Ignored.

        Raises:
            NotImplementedError: Always.
        """
        reason = 'Parallelization is not implemented yet.'
        raise NotImplementedError(reason)

    def __call__(self, data, **kwargs) -> Document:
        """Delegate to ``__call__`` of the super class; overridden to narrow the return annotation.

        Args:
            data: The data to predict on.
            **kwargs: Forwarded to the super class.

        Returns:
            The value returned by the super class.
        """
        document = super().__call__(data, **kwargs)
        return document

    def __getitem__(self, task_name: str) -> Task:
        """Get a task by its name.

        Args:
            task_name: The full name of the task.

        Returns:
            The task.
        """
        task = self.tasks[task_name]
        return task

    def __delitem__(self, task_name: str):
        """Delete a task (and every resource it owns) from this component.

        The task is removed from the config, the task registry, the decoders and the cached dependency
        structures, including the dependency sets of other tasks. Its scalar mix, if any, is left in the model
        on purpose: ``_encode`` relies on ``scalar_mixes`` being non-empty to tell whether the encoder emits
        4d hidden states.

        Args:
            task_name: The name of the task to be deleted.

        Raises:
            KeyError: If there is no task with such a name. Nothing is modified in this case.

        Examples:
            >>> del mtl['dep']  # Delete dep from MTL

        """
        if task_name not in self.tasks:
            # Fail before touching anything, otherwise a config key which is not a task would get deleted
            # and the component would be left half-modified
            raise KeyError(task_name)
        del self.config[task_name]
        self.config.task_names.remove(task_name)
        del self.tasks[task_name]
        del self.model.decoders[task_name]
        del self._computation_graph[task_name]
        # Drop the edges pointing to the deleted task so that later dependency resolution never follows them
        for dependencies in self._computation_graph.values():
            dependencies.discard(task_name)
        self._task_topological_order.pop(task_name)
        for group in self._tasks_in_topological_order:
            group: set = group
            group.discard(task_name)
        # Remove the groups which became empty, as ``resolve_tasks`` does after skipping tasks
        self._tasks_in_topological_order[:] = [group for group in self._tasks_in_topological_order if group]

    def __repr__(self):
        """Represent this component by the representation of its config."""
        config_repr = repr(self.config)
        return config_repr

    def items(self):
        """Iterate over ``(task_name, task)`` pairs.

        Yields:
            A pair of task name and task.
        """
        for task_name, task in self.tasks.items():
            yield task_name, task

    def __setattr__(self, key: str, value):
        """Set an attribute, refusing unknown attributes whose name starts with ``dict``.

        Such attributes belong to tasks rather than to this component, so the error message points to the
        tasks which do have an attribute of that name.

        Args:
            key: The name of the attribute.
            value: The value to set.

        Raises:
            TypeError: If ``key`` starts with ``dict`` and this component has no such attribute.
        """
        suspicious = key and key.startswith('dict') and not hasattr(self, key)
        if not suspicious:
            object.__setattr__(self, key, value)
            return
        please_read_the_doc_ok = f'This MTL component has no {key}.'
        matched_children = [name for name in self.config.task_names if hasattr(self[name], key)]
        if matched_children:
            please_read_the_doc_ok += f' Maybe you are looking for one of its tasks: {matched_children}. ' \
                                      f'For example, HanLP["{matched_children[0]}"].{key} = ...'
        raise TypeError(please_read_the_doc_ok)


================================================
FILE: hanlp/components/mtl/tasks/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-02 16:51
import logging
import os
import warnings
from abc import ABC, abstractmethod
from copy import copy
from typing import Callable, Dict, Any, Union, Iterable, List

import torch
from hanlp_common.util import merge_locals_kwargs
from torch.utils.data import DataLoader

from hanlp_common.constant import BOS, EOS
from hanlp.common.dataset import SamplerBuilder, SortingSamplerBuilder, TransformableDataset, KMeansSamplerBuilder
from hanlp_common.document import Document
from hanlp.common.structure import ConfigTracker
from hanlp.common.torch_component import TorchComponent
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp.utils.time_util import CountdownTimer


class Task(ConfigTracker, TorchComponent, ABC):
    """Abstract base class of the tasks used by the multi-task learning component.

    Members decorated with ``@abstractmethod`` have to be provided by subclasses. The other methods are
    overridable hooks which either do nothing or offer a generic default.
    """

    # noinspection PyMissingConstructor
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=None,
                 separate_optimizer=False,
                 cls_is_bos=False,
                 sep_is_eos=False,
                 **kwargs) -> None:
        """
        A task in the multi-task learning framework

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler. Defaults to a ``SortingSamplerBuilder`` with a
                batch size of 32 when ``None``.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            **kwargs: Additional config.
        """
        # Keep this as the very first statement: ``locals()`` must only see the constructor arguments.
        ConfigTracker.__init__(self, merge_locals_kwargs(locals(), kwargs))
        for split, path in (('trn', trn), ('dev', dev), ('tst', tst)):
            if path and os.path.isfile(path):  # anonymize local file names
                self.config.pop(split)
        self.trn = trn
        self.dev = dev
        self.tst = tst
        self.sampler_builder: Union[SortingSamplerBuilder, KMeansSamplerBuilder] = (
            SortingSamplerBuilder(batch_size=32) if sampler_builder is None else sampler_builder
        )
        self.dependencies = dependencies
        self.scalar_mix = scalar_mix
        self.use_raw_hidden_states = use_raw_hidden_states
        self.lr = lr
        self.separate_optimizer = separate_optimizer
        self.cls_is_bos = cls_is_bos
        self.sep_is_eos = sep_is_eos

    @abstractmethod
    def build_dataloader(self,
                         data,
                         transform: Callable = None,
                         training=False,
                         device=None,
                         logger: logging.Logger = None,
                         cache=False,
                         gradient_accumulation=1,
                         **kwargs) -> DataLoader:
        """
        Build a dataloader for training or evaluation.

        Args:
            data: Either a path or a list of samples.
            transform: The transform from MTL, which is usually [TransformerSequenceTokenizer, FieldLength('token')]
            training: Whether this method is called on training set.
            device: The device dataloader is intended to work with.
            logger: Logger for printing message indicating progress.
            cache: Whether the dataloader should be cached.
            gradient_accumulation: Gradient accumulation to be passed to sampler builder.
            **kwargs: Additional experimental arguments.
        """

    def build_optimizer(self, decoder: torch.nn.Module, **kwargs):
        """Optional hook for a task-specific optimizer; the default does nothing and returns ``None``.

        Args:
            decoder: The decoder module of this task.
            **kwargs: Not used by the default implementation.
        """

    def build_batch_wise_scheduler(self, decoder: torch.nn.Module, **kwargs):
        """Optional hook for a task-specific scheduler; the default does nothing and returns ``None``.

        Args:
            decoder: The decoder module of this task.
            **kwargs: Not used by the default implementation.
        """

    @abstractmethod
    def compute_loss(self,
                     batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                     criterion,
                     ) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Compute the loss of this task; must be implemented by subclasses.

        Args:
            batch: A batch of samples.
            output: The output fed back for this batch.
            criterion: The criterion handed over by the caller.

        Returns:
            A loss tensor or a dict of loss tensors, per the annotation.
        """

    @abstractmethod
    def decode_output(self,
                      output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any], decoder: torch.nn.Module, **kwargs) -> Union[Dict[str, Any], Any]:
        """Turn the output for a batch into predictions; must be implemented by subclasses.

        Args:
            output: The output produced for this batch.
            mask: A boolean mask for the batch.
            batch: A batch of samples.
            decoder: The decoder module of this task.
            **kwargs: Additional arguments defined by subclasses.
        """

    @abstractmethod
    def update_metrics(self,
                       batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any],
                       metric: Union[MetricDict, Metric]):
        """Update ``metric`` with the predictions made on ``batch``; must be implemented by subclasses.

        Args:
            batch: A batch of samples.
            output: The output produced for this batch.
            prediction: The decoded prediction.
            metric: The metric to update.
        """

    # noinspection PyMethodOverriding
    @abstractmethod
    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build the module of this task; must be implemented by subclasses.

        Args:
            encoder_size: Size of the encoder output this module is built for.
            training: Whether the model is built for training.
            **kwargs: Additional arguments defined by subclasses.
        """

    @abstractmethod
    def build_metric(self, **kwargs):
        """Build the metric of this task; must be implemented by subclasses."""

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        """No-op at task level: the default does nothing and returns ``None``."""

    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, output=False, **kwargs):
        """No-op at task level unless overridden: the default does nothing and returns ``None``."""

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, **kwargs):
        """No-op at task level: the default does nothing and returns ``None``."""

    # noinspection PyMethodMayBeStatic
    def compute_lens(self, data: Union[List[Dict[str, Any]], str], dataset: TransformableDataset,
                     input_ids='token_input_ids'):
        """Measure the length of every sample in ``dataset``.

        A warning is issued when ``dataset.cache`` is ``None``. When ``data`` is a path (a ``str``), the
        dataset is iterated once beforehand while progress is logged through a ``CountdownTimer``.

        Args:
            data: Samples to be measured or path to dataset during training time.
            dataset: The dataset whose samples are measured.
            input_ids: Field name corresponds to input ids.

        Returns:

            Length list of this samples

        """
        if dataset.cache is None:
            warnings.warn(f'Caching for the dataset is not enabled, '
                          f'try `dataset.purge_cache()` if possible. The dataset is {dataset}.')
        if isinstance(data, str):
            progress = CountdownTimer(len(dataset))
            # The samples themselves are not needed in this pass, only the act of iterating them.
            for _ in dataset:
                progress.log('Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]')
            progress.erase()
        lengths = []
        for sample in dataset:
            lengths.append(len(sample[input_ids]))
        return lengths

    def feed_batch(self,
                   h: torch.FloatTensor,
                   batch: Dict[str, torch.Tensor],
                   mask: torch.BoolTensor,
                   decoder: torch.nn.Module):
        """Run the decoder of this task on the hidden states of a batch.

        Args:
            h: Hidden states passed to the decoder as its positional argument.
            batch: The batch, passed to the decoder as ``batch``.
            mask: The mask, passed to the decoder as ``mask``.
            decoder: The decoder module to call.

        Returns:
            The output of ``decoder``.
        """
        output = decoder(h, batch=batch, mask=mask)
        return output

    def input_is_flat(self, data) -> bool:
        """
        Check whether the data is flat (meaning that it's only a single sample, not even batched).

        Returns:
            bool: ``True`` to indicate the input data is flat.

        Raises:
            NotImplementedError: Always, unless a subclass overrides this method.
        """
        raise NotImplementedError(
            '`input_is_flat()` needs to be implemented for the task component to accept raw input from user.'
        )

    @abstractmethod
    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
        """Convert predictions into results; must be implemented by subclasses.

        Args:
            prediction: The decoded prediction of a batch.
            batch: The batch the prediction was made on.

        Raises:
            NotImplementedError: Always in this base implementation.
        """
        raise NotImplementedError()

    # noinspection PyMethodMayBeStatic
    def transform_batch(self,
                        batch: Dict[str, Any],
                        results: Dict[str, Any] = None,
                        cls_is_bos=False,
                        sep_is_eos=False) -> Dict[str, Any]:
        """
        Let the task transform the batch before feeding the batch into its decoder. The default behavior is to
        adjust the head and tail of tokens, according to ``cls_is_bos``, ``sep_is_eos`` passed in and the two
        settings of the task itself. When both settings already agree with the task, the very same batch
        object is returned; otherwise a shallow copy is adjusted and returned.

        Args:
            batch: A batch of samples.
            results: Predicted results from other tasks which might be useful for this task to utilize. Say a dep task
                uses both token and pos as features, then it will need both tok and pos results to make a batch.
            cls_is_bos: First token in this batch is BOS.
            sep_is_eos: Last token in this batch is EOS.

        Returns:
            A batch.

        """
        if cls_is_bos == self.cls_is_bos and sep_is_eos == self.sep_is_eos:
            return batch
        adjusted = copy(batch)
        tokens = self._adjust_token(adjusted, cls_is_bos, sep_is_eos, 'token')
        # The length change is measured on the first sentence and applied to the whole ``token_length`` field.
        delta = len(tokens[0]) - len(adjusted['token'][0])
        adjusted['token_length'] = adjusted['token_length'] + delta
        adjusted['token'] = tokens
        if 'token_' in adjusted:
            if isinstance(adjusted['token_'][0], list):
                adjusted['token_'] = self._adjust_token(adjusted, cls_is_bos, sep_is_eos, 'token_')
            else:
                adjusted['token_'] = tokens
        return adjusted

    def _adjust_token(self, batch, cls_is_bos, sep_is_eos, token_key):
        """Add or strip ``BOS``/``EOS`` on every sentence so that it matches the settings of this task.

        Args:
            batch: A batch of samples.
            cls_is_bos: Whether the sentences currently start with BOS.
            sep_is_eos: Whether the sentences currently end with EOS.
            token_key: The field of ``batch`` holding the sentences.

        Returns:
            A new list with one adjusted sentence per input sentence.
        """
        adjusted = []
        for sent in batch[token_key]:
            if cls_is_bos and not self.cls_is_bos:
                sent = sent[1:]
            elif not cls_is_bos and self.cls_is_bos:
                sent = [BOS] + sent
            if sep_is_eos and not self.sep_is_eos:
                sent = sent[:-1]
            elif not sep_is_eos and self.sep_is_eos:
                sent = sent + [EOS]
            adjusted.append(sent)
        return adjusted

    # noinspection PyMethodMayBeStatic
    def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False):
        """
        Build samples for this task. Called when this task is the first task. Default behaviour is to take inputs as
        list of tokens and put these tokens into a dict per sample.

        Args:
            inputs: Inputs from users, usually a list of lists of tokens.
            cls_is_bos: Insert BOS to the head of each sentence.
            sep_is_eos: Append EOS to the tail of each sentence.

        Returns:
            List of samples.

        """
        samples = []
        for sent in inputs:
            if cls_is_bos:
                sent = [BOS] + sent
            if sep_is_eos:
                sent = sent + [EOS]
            samples.append({'token': sent})
        return samples

    def build_tokenizer(self, tokenizer: TransformerSequenceTokenizer):
        """Build a transformer tokenizer for this task.

        The shared tokenizer is returned as it is when its ``cls_is_bos`` and ``sep_is_eos`` agree with this
        task. Otherwise a shallow copy carrying the settings of this task is returned and the shared one is
        left untouched.

        Args:
            tokenizer: A tokenizer which is shared but can be adjusted to provide per-task settings.

        Returns:
            A TransformerSequenceTokenizer.

        """
        if tokenizer.cls_is_bos == self.cls_is_bos and tokenizer.sep_is_eos == self.sep_is_eos:
            return tokenizer
        customized = copy(tokenizer)
        customized.cls_is_bos = self.cls_is_bos
        customized.sep_is_eos = self.sep_is_eos
        return customized

    # noinspection PyMethodMayBeStatic
    def finalize_document(self, doc: Document, task_name: str):
        """Optional hook receiving the document and the name of this task; the default does nothing.

        Args:
            doc: The document.
            task_name: The name of this task.
        """


================================================
FILE: hanlp/components/mtl/tasks/amr.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-12 16:05
import logging
from typing import Dict, Any, List, Union, Iterable, Callable

import torch
from stog.data.dataset_readers.amr_parsing.amr import AMRGraph
from stog.data.dataset_readers.amr_parsing.node_utils import NodeUtilities
from stog.data.dataset_readers.amr_parsing.postprocess.node_restore import NodeRestore
from torch.utils.data import DataLoader

from hanlp_common.constant import CLS
from hanlp.common.dataset import PrefetchDataLoader, SamplerBuilder
from hanlp.common.transform import VocabDict
from hanlp.components.amr.amr_parser.graph_amr_decoder import GraphAbstractMeaningRepresentationDecoder
from hanlp.components.amr.amr_parser.graph_parser import GraphAbstractMeaningRepresentationParser
from hanlp.components.amr.amr_parser.postprocess import PostProcessor
from hanlp.components.amr.amr_parser.work import parse_batch
from hanlp.components.mtl.tasks import Task
from hanlp.datasets.parsing.amr import batchify, get_concepts
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.amr.smatch_eval import SmatchScores, get_amr_utils
from hanlp.metrics.f1 import F1_
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.io_util import get_resource
from hanlp_common.util import merge_list_of_dict, merge_locals_kwargs


class GraphAbstractMeaningRepresentationParsing(Task, GraphAbstractMeaningRepresentationParser):
    """AMR parsing as a task of the multi-task learning framework.

    Most of the work is delegated to ``GraphAbstractMeaningRepresentationParser`` through explicit
    unbound calls, with this object passed as ``self``.
    """

    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=1e-3,
                 separate_optimizer=False,
                 cls_is_bos=True,
                 sep_is_eos=False,
                 char2concept_dim=128,
                 cnn_filters=((3, 256),),
                 concept_char_dim=32,
                 concept_dim=300,
                 dropout=0.2,
                 embed_dim=512,
                 eval_every=20,
                 ff_embed_dim=1024,
                 graph_layers=2,
                 inference_layers=4,
                 num_heads=8,
                 rel_dim=100,
                 snt_layers=4,
                 unk_rate=0.33,
                 vocab_min_freq=5,
                 beam_size=8,
                 alpha=0.6,
                 max_time_step=100,
                 amr_version='2.0',
                 **kwargs) -> None:
        """Create the task, its empty vocabularies and its sense restorer.

        Every argument is forwarded to the parent constructor. ``build_model`` later hands the config to
        the decoder, and ``feed_batch`` reads ``beam_size``, ``alpha`` and ``max_time_step`` from it.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            beam_size: Passed to ``parse_batch`` outside of training.
            alpha: Passed to ``parse_batch`` outside of training.
            max_time_step: Passed to ``parse_batch`` outside of training.
            amr_version: Selects the AMR utility resource used to build the sense restorer.
            **kwargs: Additional config. The remaining named arguments are model hyper-parameters that
                are forwarded to the parent constructor and not interpreted here.
        """
        # Keep this as the very first statement: ``locals()`` must only see the constructor arguments.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()
        node_utils = NodeUtilities.from_json(get_resource(get_amr_utils(amr_version)))
        self.sense_restore = NodeRestore(node_utils)

    def build_dataloader(self,
                         data,
                         transform: Callable = None,
                         training=False,
                         device=None,
                         logger: logging.Logger = None,
                         cache=False,
                         gradient_accumulation=1,
                         **kwargs) -> DataLoader:
        """Build a prefetching dataloader for this task.

        Args:
            data: Either a list, which is first converted by the parser's ``build_samples``, or anything
                else the parser's ``build_dataset`` accepts.
            transform: The transform passed on to ``build_dataset``.
            training: Whether this is the training set; also decides whether batches are shuffled.
            device: The device passed to ``build_batchify``.
            logger: Logger passed on to the parser.
            cache: Not used by this implementation.
            gradient_accumulation: Gradient accumulation passed to the sampler builder.
            **kwargs: Not used by this implementation.

        Returns:
            A ``PrefetchDataLoader`` wrapping a ``DataLoader`` over the built dataset.
        """
        parser = GraphAbstractMeaningRepresentationParser
        if isinstance(data, list):
            data = parser.build_samples(self, data)
        dataset, lens = parser.build_dataset(self, data, logger=logger, transform=transform, training=training)
        # Vocabularies are only (re)built while they are still mutable.
        if self.vocabs.mutable:
            parser.build_vocabs(self, dataset, logger)
        batch_sampler = self.sampler_builder.build(lens, shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        loader = DataLoader(batch_sampler=batch_sampler,
                            dataset=dataset,
                            collate_fn=merge_list_of_dict,
                            num_workers=0)
        batchify_fn = self.build_batchify(device, training)
        return PrefetchDataLoader(loader, batchify=batchify_fn, prefetch=None)

    def compute_loss(self,
                     batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                     criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Sum the concept, arc and relation losses found in ``output``.

        Args:
            batch: Not used by this implementation.
            output: Four items: a concept triple, the arc loss, a relation triple and a fourth item. Only
                the first element of each triple and the arc loss contribute; the rest is ignored.
            criterion: Not used by this implementation.

        Returns:
            ``concept_loss + arc_loss + rel_loss``.
        """
        (concept_loss, _, _), arc_loss, (rel_loss, _, _), _ = output
        return concept_loss + arc_loss + rel_loss

    def decode_output(self,
                      output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any],
                      decoder: torch.nn.Module, **kwargs) -> Union[Dict[str, Any], Any]:
        """Return ``output`` untouched; no separate decoding step is performed here."""
        prediction = output
        return prediction

    def update_metrics(self,
                       batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any],
                       metric: Union[MetricDict, Metric]):
        """Do nothing: this method leaves ``metric`` untouched."""

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build the decoder from the vocabularies, the encoder size and the whole config of this task.

        Args:
            encoder_size: Passed to the decoder as ``encoder_size``.
            training: Not used by this implementation.
            **kwargs: Not used by this implementation.

        Returns:
            A ``GraphAbstractMeaningRepresentationDecoder``.
        """
        decoder = GraphAbstractMeaningRepresentationDecoder(vocabs=self.vocabs, encoder_size=encoder_size,
                                                            **self.config)
        return decoder

    def build_metric(self, **kwargs):
        """Build a ``SmatchScores`` holding one ``Smatch`` entry created as ``F1_(0, 0, 0)``."""
        initial_scores = {'Smatch': F1_(0, 0, 0)}
        return SmatchScores(initial_scores)

    def input_is_flat(self, data) -> bool:
        """Delegate the flatness check to ``GraphAbstractMeaningRepresentationParser.input_is_flat``."""
        is_flat = GraphAbstractMeaningRepresentationParser.input_is_flat(self, data)
        return is_flat

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
        """Lazily convert predictions into AMR graphs.

        This is a generator: nothing is computed before it is iterated.

        Args:
            prediction: Must provide the parallel sequences ``concept``, ``relation`` and ``score``. The
                scores are zipped along with the other two but are otherwise unused.
            batch: Not used by this implementation.

        Yields:
            One ``AMRGraph`` per prediction, after ``self.sense_restore.restore_graph`` was applied to it.
        """
        post_processor = PostProcessor(self.vocabs['rel'])
        triples = zip(prediction['concept'], prediction['relation'], prediction['score'])
        for concept, relation, _ in triples:
            graph = AMRGraph(post_processor.to_amr(concept, relation))
            self.sense_restore.restore_graph(graph)
            yield graph

    def evaluate_dataloader(self,
                            data: DataLoader,
                            criterion: Callable,
                            metric=None,
                            output=False,
                            input=None,
                            decoder=None,
                            h=None,
                            split=None,
                            **kwargs):
        """Evaluate by delegating to ``GraphAbstractMeaningRepresentationParser.evaluate_dataloader``.

        The result of the delegated call is not returned.

        Args:
            data: The dataloader to evaluate on.
            criterion: Not used by this implementation.
            metric: Passed on as ``metric``.
            output: Not used by this implementation.
            input: Passed on as ``input``.
            decoder: Passed on as ``model``.
            h: A callable; the parser receives a wrapper returning the first element of its result.
            split: Not used by this implementation.
            **kwargs: Not used by this implementation.
        """

        def first_of_h(x):
            return h(x)[0]

        # noinspection PyTypeChecker
        GraphAbstractMeaningRepresentationParser.evaluate_dataloader(self, data, logger=None, metric=metric,
                                                                     input=input, model=decoder, h=first_of_h,
                                                                     use_fast=True)

    def feed_batch(self,
                   h: torch.FloatTensor,
                   batch: Dict[str, torch.Tensor],
                   mask: torch.BoolTensor,
                   decoder: torch.nn.Module):
        """Feed a batch to the decoder, or parse it when the decoder is not in training mode.

        Args:
            h: Hidden states of the batch.
            batch: The batch.
            mask: The mask, only used in training mode.
            decoder: The decoder module.

        Returns:
            In training mode the result of ``Task.feed_batch``; otherwise the result of ``parse_batch``
            called with ``beam_size``, ``alpha`` and ``max_time_step`` from the config (falling back to
            8, 0.6 and 100).
        """
        if not decoder.training:
            config = self.config
            beam_size = config.get('beam_size', 8)
            alpha = config.get('alpha', 0.6)
            max_time_step = config.get('max_time_step', 100)
            return parse_batch(decoder, batch, beam_size, alpha, max_time_step, h=h)
        return super().feed_batch(h, batch, mask, decoder)

    def transform_batch(self, batch: Dict[str, Any], results: Dict[str, Any] = None, cls_is_bos=False,
                        sep_is_eos=False) -> Dict[str, Any]:
        """Adjust the tokens, attach lemmas and concept fields, then batchify.

        Args:
            batch: A batch of samples; must contain ``token`` and ``token_input_ids``.
            results: Results of other tasks; ``results['lem']`` is required here.
            cls_is_bos: First token in this batch is BOS.
            sep_is_eos: Last token in this batch is EOS.

        Returns:
            The output of ``batchify`` on the enriched batch.
        """
        batch = super().transform_batch(batch, results, cls_is_bos, sep_is_eos)
        batch['lemma'] = [[CLS] + lemmas for lemmas in results['lem']]
        per_sentence = []
        for tokens, lemmas in zip(batch['token'], batch['lemma']):
            # The first position of both sequences is skipped before the concepts are extracted.
            sample = {'token': tokens[1:], 'lemma': lemmas[1:]}
            per_sentence.append(get_concepts(sample, self.vocabs.predictable_concept))
        copy_seq = merge_list_of_dict(per_sentence)
        # Drop the truncated copies so that they do not overwrite the full sequences of the batch.
        copy_seq.pop('token')
        copy_seq.pop('lemma')
        batch.update(copy_seq)
        return batchify(batch, self.vocabs, device=batch['token_input_ids'].device)


================================================
FILE: hanlp/components/mtl/tasks/constituency.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-29 16:52
import logging
from typing import Dict, Any, List, Union, Iterable, Callable

import torch
from phrasetree.tree import Tree

from hanlp_common.constant import BOS, EOS
from hanlp_common.document import Document
from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from torch.utils.data import DataLoader

from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.components.mtl.tasks import Task
from hanlp.components.parsers.constituency.crf_constituency_model import CRFConstituencyDecoder
from hanlp.components.parsers.constituency.crf_constituency_parser import CRFConstituencyParser
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs, prefix_match


class CRFConstituencyParsing(Task, CRFConstituencyParser):
    """CRF constituency parsing as a task of the multi-task learning framework.

    Most of the work is delegated to ``CRFConstituencyParser`` through explicit unbound calls, with this
    object passed as ``self``.
    """

    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=None,
                 separate_optimizer=False,
                 cls_is_bos=True,
                 sep_is_eos=True,
                 delete=('', ':', '``', "''", '.', '?', '!', '-NONE-', 'TOP', ',', 'S1'),
                 equal=(('ADVP', 'PRT'),),
                 mbr=True,
                 n_mlp_span=500,
                 n_mlp_label=100,
                 mlp_dropout=.33,
                 no_subcategory=True,
                 **kwargs
                 ) -> None:
        r"""Two-stage CRF Parsing (:cite:`ijcai2020-560`).

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            delete: Constituencies to be deleted from training and evaluation.
            equal: Constituencies that are regarded as equal during evaluation. A tuple of pairs is
                converted into a ``dict`` before it is recorded.
            mbr: ``True`` to enable Minimum Bayes Risk (MBR) decoding (:cite:`smith-smith-2007-probabilistic`).
            n_mlp_span: Number of features for span decoder.
            n_mlp_label: Number of features for label decoder.
            mlp_dropout: Dropout applied to MLPs.
            no_subcategory: Strip out subcategories.
            **kwargs: Not used.
        """
        # The conversion has to precede ``locals()`` so that the dict, not the tuple, is what gets recorded.
        if isinstance(equal, tuple):
            equal = dict(equal)
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()

    # noinspection DuplicatedCode
    def build_dataloader(self,
                         data,
                         transform: Callable = None,
                         training=False,
                         device=None,
                         logger: logging.Logger = None,
                         cache=False,
                         gradient_accumulation=1,
                         **kwargs) -> DataLoader:
        """Build a padding dataloader for this task.

        Args:
            data: Either a path (a ``str``) or samples accepted by ``build_dataset``.
            transform: The transform passed on to ``build_dataset``.
            training: Whether this is the training set; also decides whether batches are shuffled.
            device: The device the dataloader works with.
            logger: Logger passed on while building vocabularies and caching.
            cache: Not used by this implementation.
            gradient_accumulation: Gradient accumulation passed to the sampler builder.
            **kwargs: Not used by this implementation.

        Returns:
            A ``PadSequenceDataLoader`` over the built dataset.
        """
        dataset = CRFConstituencyParsing.build_dataset(self, data, transform)
        dataset.purge_cache()
        # Vocabularies are only (re)built while they are still mutable.
        if self.vocabs.mutable:
            CRFConstituencyParsing.build_vocabs(self, dataset, logger)
        if isinstance(data, str):
            progress = CountdownTimer(len(dataset))
            # noinspection PyCallByClass
            BiaffineDependencyParser.cache_dataset(self, dataset, progress, training, logger)
        build_sampler = self.sampler_builder.build
        batch_sampler = build_sampler(self.compute_lens(data, dataset), shuffle=training,
                                      gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler, device=device, dataset=dataset)

    def feed_batch(self,
                   h: torch.FloatTensor,
                   batch: Dict[str, torch.Tensor],
                   mask: torch.BoolTensor,
                   decoder: torch.nn.Module):
        """Run the decoder on the hidden states and compute the mask of the batch.

        Args:
            h: Hidden states, the only input of the decoder.
            batch: The batch; must contain ``token`` unless it contains ``constituency``.
            mask: Not used; a new mask is computed by ``CRFConstituencyParser.compute_mask``.
            decoder: The decoder module.

        Returns:
            A dict with the decoder output under ``output`` and the computed mask under ``mask``.
        """
        decoded = decoder(h)
        # Offset 1 applies to batches with gold trees or whose first sentence ends with EOS, -1 otherwise.
        has_eos = 'constituency' in batch or batch['token'][0][-1] == EOS
        offset = 1 if has_eos else -1
        span_mask = CRFConstituencyParser.compute_mask(self, batch, offset=offset)
        return {'output': decoded, 'mask': span_mask}

    def compute_loss(self,
                     batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                     criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Compute the loss through ``CRFConstituencyParser.compute_loss``.

        As a side effect the span probabilities returned along with the loss are stored in
        ``output['span_probs']``, where ``decode_output`` picks them up.

        Args:
            batch: The batch; ``batch['chart_id']`` is handed over as the gold charts.
            output: The dict produced by ``feed_batch``.
            criterion: Passed on as ``crf_decoder``.

        Returns:
            The loss.
        """
        scores = output['output']
        span_mask = output['mask']
        loss, span_probs = CRFConstituencyParser.compute_loss(self, scores, batch['chart_id'], span_mask,
                                                              crf_decoder=criterion)
        output['span_probs'] = span_probs
        return loss

    def decode_output(self,
                      output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any],
                      decoder: torch.nn.Module, **kwargs) -> Union[Dict[str, Any], Any]:
        """Decode through ``CRFConstituencyParser.decode_output`` using tokens without BOS and EOS.

        Args:
            output: The dict produced by ``feed_batch``, optionally holding ``span_probs``.
            mask: Not used; the mask stored in ``output`` takes its place.
            batch: The batch; must contain ``token``.
            decoder: The decoder module.
            **kwargs: Not used by this implementation.

        Returns:
            Whatever ``CRFConstituencyParser.decode_output`` returns.
        """
        scores = output['output']
        span_mask = output['mask']
        tokens = []
        for sent in batch['token']:
            # Only a leading BOS and a trailing EOS are stripped; other tokens are kept as they are.
            start = 1 if sent[0] == BOS else 0
            sent = sent[start:] if start else sent
            if sent[-1] == EOS:
                sent = sent[:-1]
            tokens.append(sent)
        span_probs = output.get('span_probs', None)
        return CRFConstituencyParser.decode_output(self, scores, span_mask, batch, span_probs,
                                                   decoder=decoder, tokens=tokens)

    def update_metrics(self,
                       batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
        """Delegate to ``CRFConstituencyParser.update_metrics``; ``output`` is not used."""
        updated = CRFConstituencyParser.update_metrics(self, metric, batch, prediction)
        return updated

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build a ``CRFConstituencyDecoder`` sized by the chart vocabulary and the encoder size.

        Args:
            encoder_size: Passed to the decoder as ``n_hidden``.
            training: Not used by this implementation.
            **kwargs: Not used by this implementation.
        """
        n_labels = len(self.vocabs.chart)
        return CRFConstituencyDecoder(n_labels=n_labels, n_hidden=encoder_size)

    def build_metric(self, **kwargs):
        """Delegate to ``CRFConstituencyParser.build_metric``; ``kwargs`` are not forwarded."""
        metric = CRFConstituencyParser.build_metric(self)
        return metric

    def input_is_flat(self, data) -> bool:
        """Delegate the flatness check to ``CRFConstituencyParser.input_is_flat``."""
        is_flat = CRFConstituencyParser.input_is_flat(self, data)
        return is_flat

    def prediction_to_result(self, prediction: List, batch: Dict[str, Any]) -> List:
        """Return ``prediction`` as it is; ``batch`` is not used."""
        result = prediction
        return result

    def finalize_document(self, doc: Document, task_name: str):
        """Fill placeholder labels of the trees with part-of-speech tags found in the document.

        The key of the tags is resolved with ``prefix_match('pos', doc)``. Nothing happens when no tags
        are found. Otherwise, for every tree of ``doc[task_name]``, each subtree of height 2 labelled
        ``_`` is relabelled with the tag at the same position within its sentence.

        Args:
            doc: The document, modified in place.
            task_name: The key of this task's trees in ``doc``.
        """
        pos_key = prefix_match('pos', doc)
        pos: List[List[str]] = doc.get(pos_key, None)
        if not pos:
            return
        for tree, pos_per_sent in zip(doc[task_name], pos):
            height_two_subtrees = tree.subtrees(lambda t: t.height() == 2)
            # The position counts every subtree of height 2, not only the relabelled ones.
            for offset, subtree in enumerate(height_two_subtrees):
                if subtree.label() == '_':
                    subtree.set_label(pos_per_sent[offset])

    def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False):
        """Delegate to ``CRFConstituencyParser.build_samples``; the two flags are not forwarded."""
        samples = CRFConstituencyParser.build_samples(self, inputs)
        return samples


================================================
FILE: hanlp/components/mtl/tasks/dep.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-13 21:39
import logging
from typing import Dict, Any, Union, Iterable, List

import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import DataLoader

from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict, TransformList
from hanlp.components.mtl.tasks import Task
from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDecoder
from hanlp.datasets.parsing.loaders.conll_dataset import append_bos
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.constant import EOS
from hanlp_common.util import merge_locals_kwargs


class BiaffineDependencyParsing(Task, BiaffineDependencyParser):
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=2e-3,
                 separate_optimizer=False,
                 cls_is_bos=True,
                 sep_is_eos=False,
                 punct=False,
                 tree=False,
                 proj=False,
                 n_mlp_arc=500,
                 n_mlp_rel=100,
                 mlp_dropout=.33,
                 mu=.9,
                 nu=.9,
                 epsilon=1e-12,
                 decay=.75,
                 decay_steps=5000,
                 use_pos=False,
                 max_seq_len=None,
                 **kwargs) -> None:
        """Biaffine dependency parsing (:cite:`dozat:17a`).

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            punct: ``True`` to include punctuations in evaluation.
            tree: ``True`` to enforce tree constraint.
            proj: ``True`` for projective parsing.
            n_mlp_arc: Number of features for arc representation.
            n_mlp_rel: Number of features for rel representation.
            mlp_dropout: Dropout applied to MLPs.
            mu: First of the two ``betas`` coefficients handed to Adam.
            nu: Second of the two ``betas`` coefficients handed to Adam.
            epsilon: The ``eps`` term handed to Adam (added to the denominator for numerical stability).
            decay: Decay rate for the exponential lr scheduler; the per-step factor is
                ``decay ** (1 / decay_steps)``.
            decay_steps: Number of steps over which the learning rate decays by a factor of ``decay``.
            use_pos: Use pos feature.
            max_seq_len: Prune samples longer than this length (only applied to datasets loaded from a path).
            **kwargs: Not used.
        """
        # ``locals()`` is captured as the config, so no extra local variable may
        # be introduced before this call.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()

    def update_metrics(self, batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
        """Update ``metric`` with the predictions of one batch.

        Args:
            batch: The batch, providing gold ``arc`` and ``rel_id`` and optionally ``punct_mask``.
            output: The pair returned by :meth:`feed_batch`; only its second element (the mask) is used.
            prediction: The decoded prediction, unpacked positionally into the parser's ``update_metric``.
            metric: The metric to be updated.
        """
        gold_arcs, gold_rels = batch['arc'], batch['rel_id']
        token_mask = output[1]
        punct_mask = batch.get('punct_mask', None)
        BiaffineDependencyParser.update_metric(self, *prediction, gold_arcs, gold_rels, token_mask,
                                               punct_mask, metric, batch)

    def decode_output(self,
                      output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any],
                      decoder, **kwargs) -> Union[Dict[str, Any], Any]:
        """Decode arc and relation scores into a prediction.

        Args:
            output: ``((arc_scores, rel_scores), mask)`` as returned by :meth:`feed_batch`.
            mask: Ignored; the mask carried inside ``output`` is used instead.
            batch: The batch being decoded.
            decoder: Not used.
            **kwargs: Not used.

        Returns:
            The result of ``BiaffineDependencyParser.decode``.
        """
        scores, token_mask = output
        arc_scores, rel_scores = scores
        return BiaffineDependencyParser.decode(self, arc_scores, rel_scores, token_mask, batch)

    def compute_loss(self, batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
            Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Compute the parsing loss of one batch.

        Args:
            batch: The batch, providing gold ``arc`` and ``rel_id``.
            output: ``((arc_scores, rel_scores), mask)`` as returned by :meth:`feed_batch`.
            criterion: The loss function forwarded to the parser.

        Returns:
            The loss computed by ``BiaffineDependencyParser.compute_loss``.
        """
        scores, token_mask = output
        arc_scores, rel_scores = scores
        gold_arcs, gold_rels = batch['arc'], batch['rel_id']
        return BiaffineDependencyParser.compute_loss(self, arc_scores, rel_scores, gold_arcs, gold_rels,
                                                     token_mask, criterion, batch)

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build the biaffine decoder sitting on top of the shared encoder.

        Args:
            encoder_size: Size of the encoder output fed into the decoder.
            training: Not used.
            **kwargs: Not used.

        Returns:
            A ``BiaffineDecoder`` sized by the config and the relation vocabulary.
        """
        config = self.config
        n_rels = len(self.vocabs.rel)
        return BiaffineDecoder(encoder_size, config.n_mlp_arc, config.n_mlp_rel, config.mlp_dropout, n_rels)

    def build_metric(self, **kwargs):
        """Build the metric through ``BiaffineDependencyParser.build_metric``."""
        metric = BiaffineDependencyParser.build_metric(self, **kwargs)
        return metric

    def build_dataloader(self, data, transform: TransformList = None, training=False, device=None,
                         logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
        """Build a dataloader for this task.

        Args:
            data: Either a path (``str``) or in-memory samples.
            transform: The transform list applied to each sample. ``append_bos`` is inserted at its
                front in place, so the caller's list is modified.
            training: ``True`` to shuffle batches.
            device: Device the batches are moved to.
            logger: Logger for progress and pruning reports.
            gradient_accumulation: Forwarded to the sampler builder.
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader`` over the built dataset.
        """
        transform.insert(0, append_bos)
        dataset = BiaffineDependencyParser.build_dataset(self, data, transform)
        dataset.purge_cache()
        if self.vocabs.mutable:
            BiaffineDependencyParser.build_vocabs(self, dataset, logger, transformer=True)
        loaded_from_path = isinstance(data, str)
        if loaded_from_path:
            # Caching and pruning are only performed for datasets read from a path.
            progress = CountdownTimer(len(dataset))
            BiaffineDependencyParser.cache_dataset(self, dataset, progress, training, logger)
        max_seq_len = self.config.get('max_seq_len', None)
        if loaded_from_path and max_seq_len:
            dataset.prune(lambda sample: len(sample['token_input_ids']) > max_seq_len, logger)
        lengths = self.compute_lens(data, dataset)
        batch_sampler = self.sampler_builder.build(lengths, shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler,
                                     device=device,
                                     dataset=dataset,
                                     pad=self.get_pad_dict())

    def feed_batch(self, h: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask: torch.BoolTensor,
                   decoder: torch.nn.Module):
        """Run the decoder and mask out the first position of every sequence.

        Args:
            h: Hidden states from the encoder.
            batch: The batch being processed.
            mask: Token mask; it is cloned, so the caller's tensor is left untouched.
            decoder: The decoder of this task.

        Returns:
            A pair of the logits from the parent ``feed_batch`` and a copy of ``mask`` whose first
            column is set to 0.
        """
        logits = super().feed_batch(h, batch, mask, decoder)
        root_free_mask = mask.clone()
        root_free_mask[:, 0] = 0
        return logits, root_free_mask

    def build_optimizer(self, decoder: torch.nn.Module, **kwargs):
        """Build an Adam optimizer with an exponentially decaying learning rate for the decoder.

        Args:
            decoder: The decoder whose parameters are optimized.
            **kwargs: Not used.

        Returns:
            A pair of ``(optimizer, scheduler)``.
        """
        config = self.config
        optimizer = Adam(decoder.parameters(),
                         lr=config.lr,
                         betas=(config.mu, config.nu),
                         eps=config.epsilon)
        # After ``decay_steps`` scheduler steps the learning rate is scaled by ``decay``.
        gamma = config.decay ** (1 / config.decay_steps)
        scheduler = ExponentialLR(optimizer, gamma)
        return optimizer, scheduler

    def input_is_flat(self, data) -> bool:
        """Check whether the input is flat, honoring the ``use_pos`` config."""
        use_pos = self.config.use_pos
        return BiaffineDependencyParser.input_is_flat(self, data, use_pos)

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
        """Lazily convert predicted arcs and relation ids into per-sentence results.

        Args:
            prediction: A pair of ``(arcs, rels)`` tensors.
            batch: The batch, providing ``token`` for each sentence.

        Yields:
            For each sentence, a list of ``(head, relation)`` pairs, one per token excluding the
            first (ROOT) position.
        """
        arcs, rels = prediction
        # Drop the first column, which is the ROOT position.
        heads_per_batch = arcs[:, 1:].tolist()
        rel_ids_per_batch = rels[:, 1:].tolist()
        idx_to_rel = self.vocabs['rel'].idx_to_token
        for heads, rel_ids, tokens in zip(heads_per_batch, rel_ids_per_batch, batch['token']):
            # The first token is skipped as well; the remainder gives the true sentence length.
            n_tokens = len(tokens[1:])
            relations = [idx_to_rel[rel_id] for rel_id in rel_ids[:n_tokens]]
            yield list(zip(heads[:n_tokens], relations))

    def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False):
        """Wrap each tokenized sentence into a sample keyed by ``FORM``.

        Args:
            inputs: An iterable of token lists.
            cls_is_bos: Not used here.
            sep_is_eos: ``True`` to append ``EOS`` to every sentence.

        Returns:
            A list of dicts, one per sentence.
        """
        suffix = [EOS] if sep_is_eos else []
        return [{'FORM': tokens + suffix} for tokens in inputs]


================================================
FILE: hanlp/components/mtl/tasks/dep_2nd.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-07 14:14
import logging
from typing import Dict, Any, Union, Iterable, Callable, List

import torch
from hanlp_common.util import merge_locals_kwargs
from torch.utils.data import DataLoader

import hanlp.utils.torch_util
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.components.mtl.tasks import Task
from hanlp.components.parsers.biaffine.biaffine_2nd_dep import BiaffineSecondaryParser, BiaffineJointDecoder, \
    BiaffineSeparateDecoder
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict


class BiaffineSecondaryDependencyDecoder(torch.nn.Module):
    """Decoder wrapper which picks a joint or separate biaffine decoder based on ``config.joint``."""

    def __init__(self, hidden_size, config) -> None:
        """
        Args:
            hidden_size: Size of the hidden states fed into the decoder.
            config: Configuration; ``config.joint`` selects ``BiaffineJointDecoder`` over
                ``BiaffineSeparateDecoder``.
        """
        super().__init__()
        if config.joint:
            decoder_cls = BiaffineJointDecoder
        else:
            decoder_cls = BiaffineSeparateDecoder
        self.decoder = decoder_cls(hidden_size, config)

    def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
        """Score the batch and return the scores together with a mask excluding the first position.

        Args:
            contextualized_embeddings: Hidden states passed to the wrapped decoder.
            batch: The batch; ``token_length`` is read only when ``mask`` is ``None``.
            mask: Optional token mask. It is cloned, so the caller's tensor is not modified.

        Returns:
            A pair of ``(scores, mask)``, where the first column of ``mask`` has been set to 0
            after the scores were computed.
        """
        token_mask = hanlp.utils.torch_util.lengths_to_mask(batch['token_length']) if mask is None \
            else mask.clone()
        # The decoder sees the full mask; the first position is masked out only afterwards.
        scores = self.decoder(contextualized_embeddings, token_mask)
        token_mask[:, 0] = 0
        return scores, token_mask


class BiaffineSecondaryDependencyParsing(Task, BiaffineSecondaryParser):
    """A task wrapping ``BiaffineSecondaryParser`` so that it can be plugged in alongside other tasks."""

    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=2e-3,
                 separate_optimizer=False,
                 punct=False,
                 tree=False,
                 apply_constraint=True,
                 n_mlp_arc=500,
                 n_mlp_rel=100,
                 mlp_dropout=.33,
                 pad_rel=None,
                 joint=True,
                 mu=.9,
                 nu=.9,
                 epsilon=1e-12,
                 cls_is_bos=True,
                 **kwargs) -> None:
        """
        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            lr: Learning rate handed to Adam in :meth:`build_optimizer`.
            joint: Read by the decoder to choose between the joint and the separate biaffine decoder.
            mu: First of the two ``betas`` coefficients handed to Adam.
            nu: Second of the two ``betas`` coefficients handed to Adam.
            epsilon: The ``eps`` term handed to Adam.
            **kwargs: Merged into the config together with all other arguments.

        The remaining arguments are stored in the config and consumed by the base classes.
        """
        # ``locals()`` is captured as the config, so no extra local variable may
        # be introduced before this call.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()

    def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
                         logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
        """Build a dataloader for this task.

        Args:
            data: The data to load.
            transform: The transform applied to each sample.
            training: ``True`` to shuffle batches.
            device: Device the batches are moved to.
            logger: Logger forwarded to vocabulary building.
            gradient_accumulation: Forwarded to the sampler builder.
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader`` padding ``arc`` with 0 and ``arc_2nd`` with ``False``.
        """
        dataset = BiaffineSecondaryParser.build_dataset(self, data, transform)
        dataset.purge_cache()
        if self.vocabs.mutable:
            BiaffineSecondaryParser.build_vocabs(self, dataset, logger, transformer=True)
        lengths = self.compute_lens(data, dataset)
        batch_sampler = self.sampler_builder.build(lengths, shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler,
                                     device=device,
                                     dataset=dataset,
                                     pad={'arc': 0, 'arc_2nd': False})

    def update_metrics(self, batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
        """Update ``metric`` with the predictions of one batch.

        Args:
            batch: The batch, providing gold ``arc``, ``rel_id`` and ``punct_mask`` (all required).
            output: The decoder output; only its second element (the mask) is used.
            prediction: The decoded prediction, unpacked positionally into the parser's ``update_metric``.
            metric: The metric to be updated.
        """
        gold_arcs, gold_rels = batch['arc'], batch['rel_id']
        token_mask = output[1]
        punct_mask = batch['punct_mask']
        BiaffineSecondaryParser.update_metric(self, *prediction, gold_arcs, gold_rels, token_mask,
                                              punct_mask, metric, batch)

    def decode_output(self, output: Dict[str, Any], batch: Dict[str, Any], decoder, **kwargs) \
            -> Union[Dict[str, Any], Any]:
        """Decode the scores of one batch into a prediction.

        NOTE(review): unlike the ``decode_output`` of the sibling tasks, this signature has no
        ``mask`` parameter between ``output`` and ``batch``. Confirm against the caller whether a
        positional call with a mask would bind arguments incorrectly here.

        Args:
            output: The decoder output: scores at index 0 (unpacked positionally) and mask at index 1.
            batch: The batch being decoded.
            decoder: Not used.
            **kwargs: Not used.

        Returns:
            The result of ``BiaffineSecondaryParser.decode``.
        """
        scores, token_mask = output[0], output[1]
        return BiaffineSecondaryParser.decode(self, *scores, token_mask, batch)

    def compute_loss(self, batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
            Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Compute the loss of one batch.

        Args:
            batch: The batch, providing gold ``arc`` and ``rel_id``.
            output: The decoder output: scores at index 0 (unpacked positionally) and mask at index 1.
            criterion: The loss function forwarded to the parser.

        Returns:
            The loss computed by ``BiaffineSecondaryParser.compute_loss``.
        """
        scores, token_mask = output[0], output[1]
        gold_arcs, gold_rels = batch['arc'], batch['rel_id']
        return BiaffineSecondaryParser.compute_loss(self, *scores, gold_arcs, gold_rels, token_mask,
                                                    criterion, batch)

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build the decoder of this task from the encoder size and the config."""
        decoder = BiaffineSecondaryDependencyDecoder(encoder_size, self.config)
        return decoder

    def build_metric(self, **kwargs):
        """Build the metric through ``BiaffineSecondaryParser.build_metric``."""
        metric = BiaffineSecondaryParser.build_metric(self, **kwargs)
        return metric

    def build_criterion(self, **kwargs):
        """Build the criterion through ``BiaffineSecondaryParser.build_criterion``."""
        criterion = BiaffineSecondaryParser.build_criterion(self, **kwargs)
        return criterion

    def build_optimizer(self, decoder: torch.nn.Module, **kwargs):
        """Build an Adam optimizer for the decoder; no scheduler is created.

        Args:
            decoder: The decoder whose parameters are optimized.
            **kwargs: Not used.

        Returns:
            The optimizer.
        """
        config = self.config
        return torch.optim.Adam(decoder.parameters(),
                                lr=config.lr,
                                betas=(config.mu, config.nu),
                                eps=config.epsilon)

    def input_is_flat(self, data) -> bool:
        """Check whether the input is flat through ``BiaffineSecondaryParser.input_is_flat``."""
        is_flat = BiaffineSecondaryParser.input_is_flat(self, data)
        return is_flat

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
        """Convert a prediction into results via ``BiaffineSecondaryParser.predictions_to_human``.

        Args:
            prediction: The decoded prediction of a batch.
            batch: The batch, providing ``token``.

        Returns:
            Whatever ``predictions_to_human`` returns when given a fresh empty output list and
            ``use_pos=False``.
        """
        return BiaffineSecondaryParser.predictions_to_human(self, prediction, [], batch['token'], use_pos=False)


================================================
FILE: hanlp/components/mtl/tasks/lem.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-09 16:37
import logging
from typing import Dict, Any, Union, Iterable, Callable, List

import torch
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.components.lemmatizer import TransformerLemmatizer
from hanlp.components.mtl.tasks import Task
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs
from torch.utils.data import DataLoader


class LinearDecoder(torch.nn.Module):
    """A single linear layer projecting hidden states onto label logits."""

    def __init__(self,
                 hidden_size,
                 num_labels) -> None:
        """
        Args:
            hidden_size: Size of the incoming hidden states.
            num_labels: Number of labels, i.e. the size of the output logits.
        """
        super().__init__()
        self.classifier = torch.nn.Linear(in_features=hidden_size, out_features=num_labels)

    def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
        """Project hidden states onto logits.

        Args:
            contextualized_embeddings: Hidden states to be classified.
            batch: Not used.
            mask: Not used.

        Returns:
            The logits produced by the linear classifier.
        """
        logits = self.classifier(contextualized_embeddings)
        return logits


class TransformerLemmatization(Task, TransformerLemmatizer):

    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=1e-3,
                 separate_optimizer=False,
                 cls_is_bos=False,
                 sep_is_eos=False,
                 max_seq_len=None,
                 sent_delimiter=None,
                 char_level=False,
                 hard_constraint=False,
                 token_key='token',
                 **kwargs) -> None:
        """ Transition based lemmatization (:cite:`kondratyuk-straka-2019-75`).

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
            sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
                be split here.
            char_level: Whether the sequence length is measured at char level, which is never the case for
                lemmatization.
            hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
                in a sentence, it will be split at a token anyway.
            token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
            **kwargs: Not used.
        """
        # ``locals()`` is captured as the config, so no extra local variable may
        # be introduced before this call.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()

    def build_dataloader(self,
                         data: List[List[str]],
                         transform: Callable = None,
                         training=False,
                         device=None,
                         logger: logging.Logger = None,
                         cache=False,
                         gradient_accumulation=1,
                         **kwargs) -> DataLoader:
        """Build a dataloader for this task.

        Args:
            data: The data to load.
            transform: The transform applied to each sample.
            training: ``True`` to shuffle batches.
            device: Device the batches are moved to.
            logger: Logger forwarded to vocabulary building.
            cache: Not forwarded; the dataset is always built with ``cache=True``.
            gradient_accumulation: Forwarded to the sampler builder.
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader`` over the built dataset.
        """
        # Only forward the dataset options which are actually present in the config.
        dataset_keys = ['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint']
        dataset_args = {key: self.config[key] for key in dataset_keys if key in self.config}
        dataset = self.build_dataset(data, cache=True, transform=transform, **dataset_args)
        dataset.append_transform(self.vocabs)
        if self.vocabs.mutable:
            self.build_vocabs(dataset, logger)
        lengths = self.compute_lens(data, dataset)
        batch_sampler = self.sampler_builder.build(lengths, shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler,
                                     device=device,
                                     dataset=dataset)

    def compute_loss(self,
                     batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                     criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Compute the loss against ``batch['tag_id']`` under ``batch['mask']``."""
        gold_tags, token_mask = batch['tag_id'], batch['mask']
        return TransformerLemmatizer.compute_loss(self, criterion, output, gold_tags, token_mask)

    def decode_output(self,
                      output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any],
                      decoder,
                      **kwargs) -> Union[Dict[str, Any], Any]:
        """Decode the output through ``TransformerLemmatizer.decode_output``; ``kwargs`` are not forwarded."""
        prediction = TransformerLemmatizer.decode_output(self, output, mask, batch, decoder)
        return prediction

    def update_metrics(self,
                       batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any],
                       metric: Union[MetricDict, Metric]):
        """Update ``metric`` from the raw output; ``prediction`` is not used.

        Returns:
            Whatever ``TransformerLemmatizer.update_metrics`` returns.
        """
        gold_tags, token_mask = batch['tag_id'], batch['mask']
        return TransformerLemmatizer.update_metrics(self, metric, output, gold_tags, token_mask)

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build a linear decoder with one output per entry of the ``tag`` vocabulary."""
        n_tags = len(self.vocabs['tag'])
        return LinearDecoder(encoder_size, n_tags)

    def build_metric(self, **kwargs):
        """Build the metric through ``TransformerLemmatizer.build_metric``."""
        metric = TransformerLemmatizer.build_metric(self, **kwargs)
        return metric

    def input_is_flat(self, data) -> bool:
        """Check whether the input is flat through ``TransformerLemmatizer.input_is_flat``."""
        is_flat = TransformerLemmatizer.input_is_flat(self, data)
        return is_flat

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> Union[List, Dict]:
        """Convert a prediction into results via ``TransformerLemmatizer.prediction_to_human``.

        Args:
            prediction: The decoded prediction of a batch.
            batch: The batch; its ``token`` entry is additionally passed as ``token``.

        Returns:
            The result of ``prediction_to_human`` using the ``tag`` vocabulary for id lookup.
        """
        idx_to_tag = self.vocabs['tag'].idx_to_token
        return TransformerLemmatizer.prediction_to_human(self, prediction, idx_to_tag, batch,
                                                         token=batch['token'])


================================================
FILE: hanlp/components/mtl/tasks/ner/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-03 14:34


================================================
FILE: hanlp/components/mtl/tasks/ner/biaffine_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-05 01:49
import logging
from copy import copy
from typing import Dict, Any, Union, Iterable, List

import torch
from torch.utils.data import DataLoader

from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict, TransformList
from hanlp.components.mtl.tasks import Task
from hanlp.components.ner.biaffine_ner.biaffine_ner import BiaffineNamedEntityRecognizer
from hanlp.components.ner.biaffine_ner.biaffine_ner_model import BiaffineNamedEntityRecognitionDecoder
from hanlp.datasets.ner.loaders.json_ner import unpack_ner
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs


class BiaffineNamedEntityRecognition(Task, BiaffineNamedEntityRecognizer):

    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=None,
                 separate_optimizer=False,
                 doc_level_offset=True,
                 is_flat_ner=True,
                 tagset=None,
                 ret_tokens=' ',
                 ffnn_size=150,
                 loss_reduction='mean',
                 **kwargs) -> None:
        """An implementation of Named Entity Recognition as Dependency Parsing (:cite:`yu-etal-2020-named`). It treats
        every possible span as a candidate of entity and predicts its entity label. Non-entity spans are assigned NULL
        label to be excluded. The label prediction is done with a biaffine layer (:cite:`dozat:17a`). As it makes no
        assumption about the spans, it naturally supports flat NER and nested NER.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
            is_flat_ner: ``True`` for flat NER, otherwise nested NER.
            tagset: Optional tagset to prune entities outside of this tagset from datasets.
            ret_tokens: A delimiter between tokens in entities so that the surface form of an entity can be rebuilt.
            ffnn_size: Feedforward size for MLPs extracting the head/tail representations.
            loss_reduction: The loss reduction used in aggregating losses.
            **kwargs: Not used.
        """
        # ``locals()`` is captured as the config, so no extra local variable may
        # be introduced before this call.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()

    def update_metrics(self, batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
        """Update ``metric`` from the batch and its prediction; ``output`` is not used."""
        BiaffineNamedEntityRecognizer.update_metrics(self, batch, prediction, metric)

    def decode_output(self,
                      output: Dict[str, Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any],
                      decoder,
                      **kwargs) -> Union[Dict[str, Any], Any]:
        """Turn candidate span scores into predicted entities.

        Args:
            output: The decoder output, providing ``candidate_ner_scores``.
            mask: Not used.
            batch: The batch, providing ``token``.
            decoder: Not used.
            **kwargs: Not used.

        Returns:
            The result of ``get_pred_ner``.
        """
        tokens = batch['token']
        span_scores = output['candidate_ner_scores']
        return self.get_pred_ner(tokens, span_scores)

    def compute_loss(self, batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
            Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Return the loss already stored under ``output['loss']``; ``criterion`` is not used."""
        loss = output['loss']
        return loss

    def build_dataloader(self, data,
                         transform: TransformList = None,
                         training=False,
                         device=None,
                         logger: logging.Logger = None,
                         gradient_accumulation=1,
                         **kwargs) -> DataLoader:
        """Build a dataloader for this task.

        Args:
            data: The data to load.
            transform: The transform list applied to each sample. A shallow copy is extended with
                ``unpack_ner``, so the caller's list is left untouched.
            training: ``True`` to shuffle batches.
            device: Device the batches are moved to.
            logger: Logger forwarded to vocabulary building.
            gradient_accumulation: Forwarded to the sampler builder.
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader`` over the built dataset.
        """
        pipeline = copy(transform)
        pipeline.append(unpack_ner)
        dataset = BiaffineNamedEntityRecognizer.build_dataset(self, data, self.vocabs, pipeline)
        dataset.purge_cache()
        if self.vocabs.mutable:
            BiaffineNamedEntityRecognizer.build_vocabs(self, dataset, logger, self.vocabs)
        lengths = self.compute_lens(data, dataset)
        batch_sampler = self.sampler_builder.build(lengths, shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler,
                                     device=device,
                                     dataset=dataset)

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build the biaffine NER decoder sized by the config and the ``label`` vocabulary."""
        config = self.config
        n_labels = len(self.vocabs.label)
        return BiaffineNamedEntityRecognitionDecoder(encoder_size, config.ffnn_size, n_labels,
                                                     config.loss_reduction)

    def build_metric(self, **kwargs):
        """Build the metric through ``BiaffineNamedEntityRecognizer.build_metric``."""
        metric = BiaffineNamedEntityRecognizer.build_metric(self, **kwargs)
        return metric

    def input_is_flat(self, data) -> bool:
        """Check whether the input is flat through ``BiaffineNamedEntityRecognizer.input_is_flat``.

        Note that the parent's method is called with ``data`` only, without ``self``.
        """
        is_flat = BiaffineNamedEntityRecognizer.input_is_flat(data)
        return is_flat

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
        """Convert a prediction into a list of results.

        Args:
            prediction: The decoded prediction of a batch.
            batch: The batch, providing ``token``.

        Returns:
            The list filled in by ``BiaffineNamedEntityRecognizer.prediction_to_result``, using the
            ``ret_tokens`` config (defaulting to a single space).
        """
        results = []
        delimiter = self.config.get('ret_tokens', ' ')
        # The parent's method appends to ``results`` in place.
        BiaffineNamedEntityRecognizer.prediction_to_result(batch['token'], prediction, results,
                                                           ret_tokens=delimiter)
        return results


================================================
FILE: hanlp/components/mtl/tasks/ner/tag_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-03 14:35
import logging
from typing import Union, List, Dict, Any, Iterable, Callable, Set, Sequence

import torch
from hanlp_trie import DictInterface
from torch.utils.data import DataLoader

from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.components.mtl.tasks import Task
from hanlp.components.ner.transformer_ner import TransformerNamedEntityRecognizer
from hanlp.layers.crf.crf import CRF
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs


class LinearCRFDecoder(torch.nn.Module):
    """A linear classifier with an optional secondary encoder in front and an optional CRF layer."""

    def __init__(self,
                 hidden_size,
                 num_labels,
                 secondary_encoder=None,
                 crf=False) -> None:
        """
        Args:
            hidden_size: Size of the incoming hidden states.
            num_labels: Number of labels, i.e. the size of the output logits.
            secondary_encoder: Optional encoder applied to the hidden states before classification.
            crf: ``True`` to create a ``CRF`` layer over ``num_labels`` labels, stored as ``self.crf``.
        """
        super().__init__()
        self.secondary_encoder = secondary_encoder
        self.classifier = torch.nn.Linear(hidden_size, num_labels)
        if crf:
            self.crf = CRF(num_labels)
        else:
            self.crf = None

    def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
        """Compute logits; the CRF layer is not applied here.

        Args:
            contextualized_embeddings: Hidden states to be classified.
            batch: Not used.
            mask: Forwarded to the secondary encoder, if there is one.

        Returns:
            The logits produced by the linear classifier.
        """
        hidden = contextualized_embeddings
        if self.secondary_encoder:
            hidden = self.secondary_encoder(hidden, mask=mask)
        logits = self.classifier(hidden)
        return logits


class TaggingNamedEntityRecognition(Task, TransformerNamedEntityRecognizer):

    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=1e-3,
                 separate_optimizer=False,
                 max_seq_len=None,
                 sent_delimiter=None,
                 char_level=False,
                 hard_constraint=False,
                 tagging_scheme=None,
                 crf=False,
                 delimiter_in_entity=None,
                 merge_types: List[str] = None,
                 secondary_encoder=None,
                 token_key='token',
                 dict_whitelist: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
                 dict_blacklist: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
                 dict_tags: Union[
                     DictInterface, Union[Dict[Union[str, Sequence[str]], Union[str, Sequence[str]]]]] = None,
                 **kwargs) -> None:
        r"""A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for
        NER task. It can utilize whitelist gazetteers which is dict mapping from entity name to entity type.
        During decoding, it performs longest-prefix-matching of these words to override the prediction from
        underlying statistical model. It also uses a blacklist to mask out mis-predicted entities.

        .. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what dictionary can
            do and what it can't do. The tutorial in `this book <http://nlp.hankcs.com/book.php>`_ can be very helpful.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
            sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
                be split here.
            char_level: Whether the sequence length is measured at char level.
            hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
                in a sentence, it will be split at a token anyway.
            tagging_scheme: The tagging scheme of the NER tags; stored in the config for the base recognizer
                (presumably something like BIO or BMES -- TODO confirm).
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
            delimiter_in_entity: The delimiter between tokens in entity, which is used to rebuild entity by joining
                tokens during decoding.
            merge_types: The types of consecutive entities to be merged.
            secondary_encoder: An optional secondary encoder to provide enhanced representation by taking the hidden
                states from the main encoder as input.
            token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
            dict_whitelist: A :class:`dict` or a :class:`~hanlp_trie.dictionary.DictInterface` of gazetteers to be
                included into the final results.
            dict_blacklist: A :class:`set` or a :class:`~hanlp_trie.dictionary.DictInterface` of badcases to be
                excluded from the final results.
            dict_tags: A :class:`dict` or a :class:`~hanlp_trie.dictionary.DictInterface` keyed by a string or a
                sequence of strings (presumably mapping tokens to custom tags -- TODO confirm).
            **kwargs: Not used.
        """
        # ``locals()`` is captured as the config, so no extra local variable may
        # be introduced before this call.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()
        self.secondary_encoder = secondary_encoder
        self.dict_whitelist = dict_whitelist
        self.dict_blacklist = dict_blacklist
        self.dict_tags = dict_tags

    def build_dataloader(self,
                         data,
                         transform: Callable = None,
                         training=False,
                         device=None,
                         logger: logging.Logger = None,
                         cache=False,
                         gradient_accumulation=1,
                         **kwargs) -> DataLoader:
        """Build a dataloader for this task.

        Args:
            data: The data to load.
            transform: The transform applied to each sample.
            training: ``True`` to shuffle batches.
            device: Device the batches are moved to.
            logger: Logger forwarded to vocabulary building.
            cache: Forwarded to ``build_dataset``.
            gradient_accumulation: Forwarded to the sampler builder.
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader`` over the built dataset.
        """
        # Only forward the dataset options which are actually present in the config.
        dataset_keys = ['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint']
        dataset_args = {key: self.config[key] for key in dataset_keys if key in self.config}
        dataset = self.build_dataset(data, cache=cache, transform=transform, **dataset_args)
        dataset.append_transform(self.vocabs)
        dataset.purge_cache()
        if self.vocabs.mutable:
            self.build_vocabs(dataset, logger)
        lengths = self.compute_lens(data, dataset)
        batch_sampler = self.sampler_builder.build(lengths, shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler,
                                     device=device,
                                     dataset=dataset)

    def compute_loss(self,
                     batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                     criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Compute the loss against ``batch['tag_id']`` under ``batch['mask']``."""
        gold_tags, token_mask = batch['tag_id'], batch['mask']
        return TransformerNamedEntityRecognizer.compute_loss(self, criterion, output, gold_tags, token_mask)

    def decode_output(self,
                      output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any],
                      decoder,
                      **kwargs) -> Union[Dict[str, Any], Any]:
        """Decode the output through ``TransformerNamedEntityRecognizer.decode_output``.

        The ``mask`` argument is ignored in favor of ``batch['mask']``; ``kwargs`` are not forwarded.
        """
        token_mask = batch['mask']
        return TransformerNamedEntityRecognizer.decode_output(self, output, token_mask, batch, decoder)

    def update_metrics(self,
                       batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any],
                       metric: Union[MetricDict, Metric]):
        """Update ``metric`` with the output and prediction of one batch.

        Returns:
            Whatever ``TransformerNamedEntityRecognizer.update_metrics`` returns.
        """
        gold_tags, token_mask = batch['tag_id'], batch['mask']
        return TransformerNamedEntityRecognizer.update_metrics(self, metric, output, gold_tags, token_mask,
                                                               batch, prediction)

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build a linear (optionally CRF) decoder with one output per entry of the ``tag`` vocabulary."""
        n_tags = len(self.vocabs['tag'])
        return LinearCRFDecoder(encoder_size, n_tags, self.secondary_encoder, self.config.crf)

    def build_metric(self, **kwargs):
        """Build the metric through ``TransformerNamedEntityRecognizer.build_metric``."""
        metric = TransformerNamedEntityRecognizer.build_metric(self, **kwargs)
        return metric

    def input_is_flat(self, data) -> bool:
        """Check whether the input is flat through ``TransformerNamedEntityRecognizer.input_is_flat``."""
        is_flat = TransformerNamedEntityRecognizer.input_is_flat(self, data)
        return is_flat

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> Union[List, Dict]:
        """Convert a prediction into results via ``TransformerNamedEntityRecognizer.prediction_to_human``,
        using the ``tag`` vocabulary for id lookup."""
        idx_to_tag = self.vocabs['tag'].idx_to_token
        return TransformerNamedEntityRecognizer.prediction_to_human(self, prediction, idx_to_tag, batch)


================================================
FILE: hanlp/components/mtl/tasks/pos.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-19 18:56
import logging
from typing import Dict, Any, Union, Iterable, Callable, List, Tuple, Sequence

import torch
from torch.utils.data import DataLoader

from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.components.mtl.tasks import Task
from hanlp.components.taggers.transformers.transformer_tagger import TransformerTagger
from hanlp.layers.crf.crf import CRF
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs
from hanlp_trie import DictInterface, TrieDict


class LinearCRFDecoder(torch.nn.Module):
    def __init__(self,
                 hidden_size,
                 num_labels,
                 crf=False) -> None:
        """A linear projection onto the tag set, optionally paired with a CRF
        (:cite:`lafferty2001conditional`) layer.

        Args:
            hidden_size: Size of hidden states.
            num_labels: Size of tag set.
            crf: ``True`` to create a CRF (:cite:`lafferty2001conditional`) layer, otherwise ``self.crf`` is ``None``.
        """
        super().__init__()
        self.classifier = torch.nn.Linear(hidden_size, num_labels)
        if crf:
            self.crf = CRF(num_labels)
        else:
            self.crf = None

    def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
        """Project hidden states onto the tag set.

        Args:
            contextualized_embeddings: Hidden states for contextual layer.
            batch: A dict of a batch. Not used here.
            mask: Mask for tokens. Not used here.

        Returns:
            Logits. The CRF layer is not applied here: users are expected to call ``CRF.decode`` on these
            emissions during decoding and ``CRF.forward`` during training.

        """
        logits = self.classifier(contextualized_embeddings)
        return logits


class TransformerTagging(Task, TransformerTagger):

    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=1e-3,
                 separate_optimizer=False,
                 cls_is_bos=False,
                 sep_is_eos=False,
                 max_seq_len=None,
                 sent_delimiter=None,
                 char_level=False,
                 hard_constraint=False,
                 crf=False,
                 token_key='token',
                 dict_tags: Union[
                     DictInterface, Union[Dict[Union[str, Sequence[str]], Union[str, Sequence[str]]]]] = None,
                 **kwargs) -> None:
        """A simple tagger built from a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer,
        usable for any tagging task including PoS tagging and many others. It also supports a custom dictionary
        ``dict_tags`` which performs ``longest-prefix-matching`` and replaces the tags of matched tokens with the
        given tags.


        .. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what dictionary can
            do and what it can't do. The tutorial in `this book <http://nlp.hankcs.com/book.php>`_ can be very helpful.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
            sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
                be split here.
            char_level: Whether the sequence length is measured at char level, which is never the case for
                lemmatization.
            hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
                in a sentence, it will be split at a token anyway.
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
            token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
            dict_tags: A custom dictionary to override predicted tags by performing longest-prefix-matching.
            **kwargs: Not used.
        """
        # ``locals()`` is captured as the config, so no extra local variable may be introduced before this call.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()
        self.dict_tags = dict_tags

    def build_dataloader(self,
                         data,
                         transform: Callable = None,
                         training=False,
                         device=None,
                         logger: logging.Logger = None,
                         cache=False,
                         gradient_accumulation=1,
                         **kwargs) -> DataLoader:
        """Build a dataloader for this task.

        Args:
            data: The data to load, forwarded to ``build_dataset``.
            transform: Transform applied to the dataset.
            training: ``True`` to shuffle the batches.
            device: Device of the batches.
            logger: Logger used when building vocabs.
            cache: Not used; the dataset is always built with ``cache=True``.
            gradient_accumulation: Forwarded to the sampler builder.
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader`` over the dataset.
        """
        # Only forward the splitting-related options which are actually present in the config.
        forwarded_keys = ['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint']
        args = {key: self.config[key] for key in forwarded_keys if key in self.config}
        dataset = self.build_dataset(data, cache=True, transform=transform, **args)
        dataset.append_transform(self.vocabs)
        if self.vocabs.mutable:
            self.build_vocabs(dataset, logger)
        batch_sampler = self.sampler_builder.build(self.compute_lens(data, dataset),
                                                   shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler, device=device, dataset=dataset)

    def compute_loss(self,
                     batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                     criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Compute the loss via ``TransformerTagger.compute_loss`` using ``batch['tag_id']`` and ``batch['mask']``.

        Args:
            batch: A dict of a batch, which must contain the keys ``tag_id`` and ``mask``.
            output: Output of the decoder for this batch.
            criterion: The loss function.
        """
        gold_tag_ids = batch['tag_id']
        token_mask = batch['mask']
        return TransformerTagger.compute_loss(self, criterion, output, gold_tag_ids, token_mask)

    def decode_output(self,
                      output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any],
                      decoder,
                      **kwargs) -> Union[Dict[str, Any], Any]:
        """Decode the decoder output via ``TransformerTagger.decode_output``.

        Args:
            output: Output of the decoder for this batch.
            mask: Mask for tokens.
            batch: A dict of a batch.
            decoder: The decoder of this task.
            **kwargs: Not used.
        """
        decoded = TransformerTagger.decode_output(self, output, mask, batch, decoder)
        return decoded

    def update_metrics(self,
                       batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any],
                       metric: Union[MetricDict, Metric]):
        """Update ``metric`` via ``TransformerTagger.update_metrics``.

        Args:
            batch: A dict of a batch, which must contain the keys ``tag_id`` and ``mask``.
            output: Output of the decoder for this batch.
            prediction: Not used.
            metric: The metric to update.
        """
        gold_tag_ids = batch['tag_id']
        token_mask = batch['mask']
        return TransformerTagger.update_metrics(self, metric, output, gold_tag_ids, token_mask)

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build a ``LinearCRFDecoder`` sized by the ``tag`` vocab.

        Args:
            encoder_size: Size of the hidden states fed into the decoder.
            training: Not used.
            **kwargs: Not used.
        """
        num_tags = len(self.vocabs['tag'])
        return LinearCRFDecoder(encoder_size, num_tags, self.config.crf)

    def build_metric(self, **kwargs):
        """Build the metric of this task via ``TransformerTagger.build_metric``."""
        metric = TransformerTagger.build_metric(self, **kwargs)
        return metric

    def input_is_flat(self, data) -> bool:
        """Check whether ``data`` is flat, as decided by ``TransformerTagger.input_is_flat``."""
        is_flat = TransformerTagger.input_is_flat(self, data)
        return is_flat

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> Union[List, Dict]:
        """Convert a prediction into results via ``TransformerTagger.prediction_to_human``.

        Args:
            prediction: Decoded prediction for this batch.
            batch: A dict of a batch.
        """
        idx_to_tag = self.vocabs['tag'].idx_to_token
        return TransformerTagger.prediction_to_human(self, prediction, idx_to_tag, batch)


================================================
FILE: hanlp/components/mtl/tasks/sdp.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-13 21:39
import logging
from typing import Dict, Any, Union, Iterable, List

import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import DataLoader

from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict, TransformList
from hanlp.components.mtl.tasks import Task
from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDecoder
from hanlp.components.parsers.biaffine.biaffine_sdp import BiaffineSemanticDependencyParser
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs


class BiaffineSemanticDependencyParsing(Task, BiaffineSemanticDependencyParser):
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=2e-3, separate_optimizer=False,
                 punct=False,
                 tree=True,
                 pad_rel=None,
                 apply_constraint=False,
                 single_root=True,
                 no_zero_head=None,
                 n_mlp_arc=500,
                 n_mlp_rel=100,
                 mlp_dropout=.33,
                 mu=.9,
                 nu=.9,
                 epsilon=1e-12,
                 decay=.75,
                 decay_steps=5000,
                 cls_is_bos=True,
                 use_pos=False,
                 **kwargs) -> None:
        r"""Implementation of "Stanford's graph-based neural dependency parser at
        the conll 2017 shared task" (:cite:`dozat2017stanford`) and "Establishing Strong Baselines for the New Decade"
        (:cite:`he-choi-2019`).

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            punct: ``True`` to include punctuations in evaluation.
            tree: Stored in the config; presumably requests well-formed tree decoding (TODO: confirm against
                the parser).
            pad_rel: Padding token for relations.
            apply_constraint: Enforce constraints (see following parameters).
            single_root: Force single root.
            no_zero_head: Every token has at least one head.
            n_mlp_arc: Number of features for arc representation.
            n_mlp_rel: Number of features for rel representation.
            mlp_dropout: Dropout applied to MLPs.
            mu: First coefficient used for computing running averages of gradient and its square in Adam.
            nu: Second coefficient used for computing running averages of gradient and its square in Adam.
            epsilon: Term added to the denominator to improve numerical stability.
            decay: Decay rate for the exponential lr scheduler.
            decay_steps: Decay every ``decay_steps`` steps.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            use_pos: Use pos feature.
            **kwargs: Not used.
        """
        # ``locals()`` is captured as the config, so no extra local variable may be introduced before this call.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()

    def update_metrics(self, batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
        """Update ``metric`` via ``BiaffineSemanticDependencyParser.update_metric``.

        Args:
            batch: A dict of a batch, which must contain the keys ``arc`` and ``rel_id``.
            output: The ``(logits, mask, punct_mask)`` triple produced by :meth:`feed_batch`.
            prediction: Decoded prediction, unpacked into positional arguments.
            metric: The metric to update.
        """
        mask3d = output[1]
        punct_mask = output[-1]
        BiaffineSemanticDependencyParser.update_metric(self, *prediction, batch['arc'], batch['rel_id'], mask3d,
                                                       punct_mask, metric, batch)

    def decode_output(self,
                      output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any],
                      decoder, **kwargs) -> Union[Dict[str, Any], Any]:
        """Decode arcs and relations via ``BiaffineSemanticDependencyParser.decode``.

        Args:
            output: The ``(logits, mask, punct_mask)`` triple produced by :meth:`feed_batch`.
            mask: Not used; the mask carried inside ``output`` is used instead.
            batch: A dict of a batch.
            decoder: Not used.
            **kwargs: Not used.
        """
        scores, mask3d, _ = output
        arc_scores, rel_scores = scores
        return BiaffineSemanticDependencyParser.decode(self, arc_scores, rel_scores, mask3d, batch)

    def compute_loss(self, batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
            Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Compute the loss via ``BiaffineSemanticDependencyParser.compute_loss``.

        Args:
            batch: A dict of a batch, which must contain the keys ``arc`` and ``rel_id``.
            output: The ``(logits, mask, punct_mask)`` triple produced by :meth:`feed_batch`.
            criterion: The loss function.
        """
        scores, mask3d, _ = output
        arc_scores, rel_scores = scores
        gold_arcs = batch['arc']
        gold_rels = batch['rel_id']
        return BiaffineSemanticDependencyParser.compute_loss(self, arc_scores, rel_scores, gold_arcs, gold_rels,
                                                             mask3d, criterion, batch)

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build a ``BiaffineDecoder`` configured by ``n_mlp_arc``, ``n_mlp_rel`` and ``mlp_dropout``, with one
        output per relation in the ``rel`` vocab.

        Args:
            encoder_size: Size of the hidden states fed into the decoder.
            training: Not used.
            **kwargs: Not used.
        """
        config = self.config
        num_rels = len(self.vocabs.rel)
        return BiaffineDecoder(encoder_size, config.n_mlp_arc, config.n_mlp_rel, config.mlp_dropout, num_rels)

    def build_metric(self, **kwargs):
        """Build the metric of this task via ``BiaffineSemanticDependencyParser.build_metric``."""
        metric = BiaffineSemanticDependencyParser.build_metric(self, **kwargs)
        return metric

    def build_dataloader(self, data, transform: TransformList = None, training=False, device=None,
                         logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
        """Build a dataloader for this task.

        Args:
            data: The data to load. When it is a ``str`` the dataset is additionally cached.
            transform: Transforms applied to the dataset.
            training: ``True`` to shuffle the batches.
            device: Device of the batches.
            logger: Logger used when building vocabs and caching.
            gradient_accumulation: Forwarded to the sampler builder.
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader`` over the dataset, padded according to ``self.get_pad_dict()``.
        """
        dataset = BiaffineSemanticDependencyParser.build_dataset(self, data, transform)
        dataset.purge_cache()
        if self.vocabs.mutable:
            BiaffineSemanticDependencyParser.build_vocabs(self, dataset, logger, transformer=True)
        if isinstance(data, str):
            timer = CountdownTimer(len(dataset))
            BiaffineSemanticDependencyParser.cache_dataset(self, dataset, timer, training, logger)
        batch_sampler = self.sampler_builder.build(self.compute_lens(data, dataset),
                                                   shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler,
                                     device=device,
                                     dataset=dataset,
                                     pad=self.get_pad_dict())

    def feed_batch(self, h: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask: torch.BoolTensor,
                   decoder: torch.nn.Module):
        """Run the decoder and prepare the masks needed by loss, decoding and metrics.

        Args:
            h: Hidden states from the encoder.
            batch: A dict of a batch, optionally containing ``punct_mask``.
            mask: Mask for tokens. It is cloned before being modified, so the caller's tensor is left intact.
            decoder: The decoder of this task.

        Returns:
            A triple of ``(logits, mask, punct_mask)`` where both masks have been converted by
            ``convert_to_3d_mask`` and ``convert_to_3d_puncts`` respectively.
        """
        logits = super().feed_batch(h, batch, mask, decoder)
        arc_scores = logits[0]
        token_mask = mask.clone()
        token_mask[:, 0] = 0  # Mask out the first position of every sequence
        mask3d = self.convert_to_3d_mask(arc_scores, token_mask)
        punct_mask = self.convert_to_3d_puncts(batch.get('punct_mask', None), mask3d)
        return logits, mask3d, punct_mask

    def build_optimizer(self, decoder: torch.nn.Module, **kwargs):
        """Build an Adam optimizer together with an exponential lr scheduler for the decoder.

        Args:
            decoder: The decoder whose parameters are optimized.
            **kwargs: Not used.

        Returns:
            A tuple of ``(optimizer, scheduler)``.
        """
        cfg = self.config
        optimizer = Adam(decoder.parameters(), lr=cfg.lr, betas=(cfg.mu, cfg.nu), eps=cfg.epsilon)
        # Per-step factor chosen so that the lr is multiplied by ``decay`` after ``decay_steps`` scheduler steps.
        gamma = cfg.decay ** (1 / cfg.decay_steps)
        scheduler = ExponentialLR(optimizer, gamma=gamma)
        return optimizer, scheduler

    def input_is_flat(self, data) -> bool:
        """Check whether ``data`` is flat via ``BiaffineSemanticDependencyParser.input_is_flat``."""
        use_pos = self.config.use_pos
        return BiaffineSemanticDependencyParser.input_is_flat(self, data, use_pos)

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
        """Convert predicted arcs and relations into per-sentence lists of ``(head, relation)`` pairs.

        Args:
            prediction: A pair of ``(arcs, rels)`` tensors.
            batch: A dict of a batch, which must contain the key ``token``.

        Yields:
            For each sentence, a list holding one list per token, each made of ``(head, relation)`` tuples.
        """
        arcs, rels = prediction
        # Skip the ROOT
        arcs = arcs[:, 1:, :].tolist()
        rels = rels[:, 1:, :].tolist()
        idx_to_rel = self.vocabs['rel'].idx_to_token
        for sent_arcs, sent_rels, tokens in zip(arcs, rels, batch['token']):
            sent_len = len(tokens[1:])  # The first token is skipped just like the ROOT row above
            head_range = range(sent_len + 1)
            result = []
            for arc_row, rel_row in zip(sent_arcs[:sent_len], sent_rels[:sent_len]):
                result.append([(head, idx_to_rel[rel_row[head]]) for head in head_range if arc_row[head]])
            yield result

    def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False):
        """Build samples via ``BiaffineSemanticDependencyParser.build_samples``.

        Args:
            inputs: The inputs to convert into samples.
            cls_is_bos: Not used.
            sep_is_eos: Not used.
        """
        use_pos = self.config.use_pos
        return BiaffineSemanticDependencyParser.build_samples(self, inputs, use_pos)


================================================
FILE: hanlp/components/mtl/tasks/srl/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-04 16:49


================================================
FILE: hanlp/components/mtl/tasks/srl/bio_srl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-04 16:50
import logging
from typing import Dict, Any, List, Union, Iterable, Callable

import torch
from torch.utils.data import DataLoader

from hanlp.common.dataset import PadSequenceDataLoader, SamplerBuilder
from hanlp.common.transform import VocabDict
from hanlp.components.mtl.tasks import Task
from hanlp.components.srl.span_bio.baffine_tagging import BiaffineTaggingDecoder
from hanlp.components.srl.span_bio.span_bio import SpanBIOSemanticRoleLabeler
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs
import torch.nn.functional as F


class SpanBIOSemanticRoleLabeling(Task, SpanBIOSemanticRoleLabeler):

    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=None,
                 separate_optimizer=False,
                 cls_is_bos=False,
                 sep_is_eos=False,
                 crf=False,
                 n_mlp_rel=300,
                 mlp_dropout=0.2,
                 loss_reduction='mean',
                 doc_level_offset=True,
                 **kwargs) -> None:
        """A span based Semantic Role Labeling task which tags the role of each token with the BIO scheme. For a
        predicate and a token, biaffine (:cite:`dozat:17a`) is used to predict their relation as one of BIO-ROLE.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
            n_mlp_rel: Output size of MLPs for representing predicate and tokens.
            mlp_dropout: Dropout applied to MLPs.
            loss_reduction: Loss reduction for aggregating losses.
            doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
            **kwargs: Not used.
        """
        # ``locals()`` is captured as the config, so no extra local variable may be introduced before this call.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()

    def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
                         logger: logging.Logger = None, cache=False, gradient_accumulation=1, **kwargs) -> DataLoader:
        """Build a dataloader for this task.

        Args:
            data: The data to load, forwarded to ``build_dataset``.
            transform: Transform applied to the dataset before ``self.vocabs``.
            training: ``True`` to shuffle the batches.
            device: Device of the batches.
            logger: Logger used when building vocabs.
            cache: Not used.
            gradient_accumulation: Forwarded to the sampler builder.
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader`` over the dataset.
        """
        transforms = [transform, self.vocabs]
        dataset = self.build_dataset(data, transform=transforms)
        dataset.purge_cache()
        if self.vocabs.mutable:
            SpanBIOSemanticRoleLabeler.build_vocabs(self, dataset, logger)
        batch_sampler = self.sampler_builder.build(self.compute_lens(data, dataset),
                                                   shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler, device=device, dataset=dataset)

    def compute_loss(self, batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
            Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Compute the loss via ``SpanBIOSemanticRoleLabeler.compute_loss``.

        Args:
            batch: A dict of a batch, which must contain the key ``srl_id``.
            output: The ``(pred, mask)`` pair produced by :meth:`feed_batch`.
            criterion: The loss function.
        """
        pred, mask3d = output
        gold_srl_ids = batch['srl_id']
        return SpanBIOSemanticRoleLabeler.compute_loss(self, criterion, pred, gold_srl_ids, mask3d)

    def decode_output(self,
                      output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any],
                      decoder: torch.nn.Module, **kwargs) -> Union[Dict[str, Any], Any]:
        """Decode the decoder output via ``SpanBIOSemanticRoleLabeler.decode_output``.

        Args:
            output: The ``(pred, mask)`` pair produced by :meth:`feed_batch`.
            mask: Not used; the mask carried inside ``output`` is used instead.
            batch: A dict of a batch.
            decoder: The decoder of this task.
            **kwargs: Not used.
        """
        pred, mask3d = output
        return SpanBIOSemanticRoleLabeler.decode_output(self, pred, mask3d, batch, decoder)

    def update_metrics(self, batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
        """Update ``metric`` via ``SpanBIOSemanticRoleLabeler.update_metrics``.

        Args:
            batch: A dict of a batch.
            output: Not used.
            prediction: Decoded prediction for this batch.
            metric: The metric to update.
        """
        updated = SpanBIOSemanticRoleLabeler.update_metrics(self, metric, prediction, batch)
        return updated

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build a ``BiaffineTaggingDecoder`` sized by the ``srl`` vocab.

        Args:
            encoder_size: Size of the hidden states fed into the decoder.
            training: Not used.
            **kwargs: Not used.
        """
        config = self.config
        num_labels = len(self.vocabs['srl'])
        return BiaffineTaggingDecoder(num_labels, encoder_size, config.n_mlp_rel, config.mlp_dropout, config.crf)

    def feed_batch(self, h: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask: torch.BoolTensor,
                   decoder: torch.nn.Module):
        """Run the decoder on hidden states.

        Args:
            h: Hidden states from the encoder.
            batch: A dict of a batch. Not used here.
            mask: Mask for tokens, converted by ``self.compute_mask``.
            decoder: The decoder of this task.

        Returns:
            A pair of ``(pred, mask3d)``. When ``h`` has no element, ``([], None)`` is returned instead. When CRF
            is enabled, ``pred`` holds log-probabilities selected by ``mask3d[0]`` instead of raw decoder outputs.
        """
        if not h.numel():  # No tokens, don't bother to run the decoder
            return [], None
        pred = decoder(h)
        mask3d = self.compute_mask(mask)
        if not self.config.crf:
            return pred, mask3d
        token_index = mask3d[0]
        selected = pred.flatten(end_dim=1)[token_index]
        return F.log_softmax(selected, dim=-1), mask3d

    def build_metric(self, **kwargs):
        """Build the metric of this task via ``SpanBIOSemanticRoleLabeler.build_metric``.

        Args:
            **kwargs: Not used; nothing is forwarded.
        """
        metric = SpanBIOSemanticRoleLabeler.build_metric(self)
        return metric

    def input_is_flat(self, data) -> bool:
        """Check whether ``data`` is flat via ``SpanBIOSemanticRoleLabeler.input_is_flat``."""
        is_flat = SpanBIOSemanticRoleLabeler.input_is_flat(self, data)
        return is_flat

    def prediction_to_result(self, prediction: List, batch: Dict[str, Any]) -> List:
        """Yield the results produced by ``SpanBIOSemanticRoleLabeler.prediction_to_result``.

        Args:
            prediction: Decoded prediction for this batch.
            batch: A dict of a batch.
        """
        results = SpanBIOSemanticRoleLabeler.prediction_to_result(self, prediction, batch)
        yield from results


================================================
FILE: hanlp/components/mtl/tasks/srl/rank_srl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-05 15:43
import logging
from typing import Union, List, Dict, Any, Iterable, Callable

import torch
from torch.utils.data import DataLoader

from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.components.mtl.tasks import Task
from hanlp.components.srl.span_rank.span_rank import SpanRankingSemanticRoleLabeler
from hanlp.components.srl.span_rank.span_ranking_srl_model import SpanRankingSRLDecoder
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs


class SpanRankingSemanticRoleLabeling(Task, SpanRankingSemanticRoleLabeler):

    def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
                 dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
                 lr=1e-3, separate_optimizer=False,
                 lexical_dropout=0.5,
                 dropout=0.2,
                 span_width_feature_size=20,
                 ffnn_size=150,
                 ffnn_depth=2,
                 argument_ratio=0.8,
                 predicate_ratio=0.4,
                 max_arg_width=30,
                 mlp_label_size=100,
                 enforce_srl_constraint=False,
                 use_gold_predicates=False,
                 doc_level_offset=True,
                 use_biaffine=False,
                 loss_reduction='mean',
                 with_argument=' ',
                 **kwargs) -> None:
        r""" An implementation of "Jointly Predicting Predicates and Arguments in Neural Semantic Role Labeling"
        (:cite:`he-etal-2018-jointly`). It generates candidate triples of (predicate, arg_start, arg_end) and ranks
        them.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            lexical_dropout: Dropout applied to hidden states of encoder.
            dropout: Dropout used for other layers except the encoder.
            span_width_feature_size: Span width feature size.
            ffnn_size: Feedforward size.
            ffnn_depth: Number of layers of feedforward MLPs.
            argument_ratio: Ratio of candidate arguments over number of tokens.
            predicate_ratio: Ratio of candidate predicates over number of tokens.
            max_arg_width: Maximum argument width.
            mlp_label_size: Feature size for label representation.
            enforce_srl_constraint: Enforce SRL constraints (number of core ARGs etc.).
            use_gold_predicates: Use gold predicates instead of predicting them.
            doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
            use_biaffine: ``True`` to use biaffine (:cite:`dozat:17a`) instead of linear layer for label prediction.
            loss_reduction: The loss reduction used in aggregating losses.
            with_argument: The delimiter between tokens in arguments to be used for joining tokens for outputs.
            **kwargs: Not used.
        """
        # ``locals()`` is captured as the config, so no extra local variable may be introduced before this call.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()

    def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
                         logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
        """Build a dataloader for this task.

        Args:
            data: The data to load. Whether it is a ``list`` is passed to ``build_dataset`` as a flag.
            transform: Transform applied to the dataset.
            training: ``True`` to shuffle the batches.
            device: Device of the batches.
            logger: Logger forwarded to ``build_dataset``.
            gradient_accumulation: Forwarded to the sampler builder.
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader`` over the dataset.
        """
        data_is_list = isinstance(data, list)
        dataset = self.build_dataset(data, data_is_list, logger, transform)
        dataset.purge_cache()
        batch_sampler = self.sampler_builder.build(self.compute_lens(data, dataset),
                                                   shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler, device=device, dataset=dataset)

    def update_metrics(self, batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
        """Update the metrics via ``SpanRankingSemanticRoleLabeler.update_metrics``.

        Args:
            batch: A dict of a batch.
            output: Not used.
            prediction: Decoded prediction, wrapped into a dict under the key ``prediction``.
            metric: A metric dict whose values are passed along as a tuple.
        """
        wrapped_prediction = {'prediction': prediction}
        metrics = tuple(metric.values())
        return SpanRankingSemanticRoleLabeler.update_metrics(self, batch, wrapped_prediction, metrics)

    def decode_output(self,
                      output: Dict[str, Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any],
                      decoder, **kwargs) -> Union[Dict[str, Any], Any]:
        """Decode the decoder output via ``SpanRankingSemanticRoleLabeler.decode_output``.

        Args:
            output: Output of the decoder for this batch.
            mask: Not used.
            batch: A dict of a batch.
            decoder: Not used.
            **kwargs: Not used.
        """
        decoded = SpanRankingSemanticRoleLabeler.decode_output(self, output, batch)
        return decoded

    def compute_loss(self, batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
            Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Return the loss stored in ``output['loss']``.

        Args:
            batch: Not used.
            output: Output of the decoder, which must contain the key ``loss``.
            criterion: Not used.
        """
        loss = output['loss']
        return loss

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build a ``SpanRankingSRLDecoder`` sized by the ``srl_label`` vocab and configured by ``self.config``.

        Args:
            encoder_size: Size of the hidden states fed into the decoder.
            training: Not used.
            **kwargs: Not used.
        """
        num_labels = len(self.vocabs.srl_label)
        return SpanRankingSRLDecoder(encoder_size, num_labels, self.config)

    def build_metric(self, **kwargs):
        """Build a ``MetricDict`` holding the predicate F1 under ``predicate`` and the end-to-end F1 under ``e2e``."""
        f1_pair = SpanRankingSemanticRoleLabeler.build_metric(self, **kwargs)
        predicate_f1, end_to_end_f1 = f1_pair
        return MetricDict({'predicate': predicate_f1, 'e2e': end_to_end_f1})

    def build_criterion(self, **kwargs):
        """Build no criterion: :meth:`compute_loss` reads the loss from the decoder output instead."""
        return None

    def input_is_flat(self, data) -> bool:
        """Check whether ``data`` is flat via ``SpanRankingSemanticRoleLabeler.input_is_flat``."""
        is_flat = SpanRankingSemanticRoleLabeler.input_is_flat(self, data)
        return is_flat

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
        """Format a prediction via ``SpanRankingSemanticRoleLabeler.format_dict_to_results``.

        Args:
            prediction: Decoded prediction for this batch.
            batch: A dict of a batch, which must contain the key ``token``.
        """
        # Falls back to a single space when ``with_argument`` is absent from the config.
        argument_delimiter = self.config.get('with_argument', ' ')
        return SpanRankingSemanticRoleLabeler.format_dict_to_results(batch['token'], prediction,
                                                                     exclusive_offset=True,
                                                                     with_predicate=True,
                                                                     with_argument=argument_delimiter,
                                                                     label_first=True)


================================================
FILE: hanlp/components/mtl/tasks/tok/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-11 16:34

================================================
FILE: hanlp/components/mtl/tasks/tok/reg_tok.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-02 16:51
import logging
from typing import Union, List, Dict, Any, Iterable, Tuple

import torch
from hanlp_common.util import merge_locals_kwargs
from torch import Tensor
from torch.utils.data import DataLoader

import hanlp.utils.torch_util
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import FieldLength, TransformList
from hanlp.components.mtl.tasks import Task
from hanlp.datasets.tokenization.loaders.txt import TextTokenizingDataset
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.layers.transformers.pt_imports import PreTrainedTokenizer
from hanlp.metrics.chunking.binary_chunking_f1 import BinaryChunkingF1
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer


def generate_token_span_tuple(sample: dict):
    """Derive ``(begin, end)`` spans from the prefix mask of a sample.

    The first and last entries of ``sample['text_prefix_mask']`` are dropped. Every truthy entry of the
    remaining mask, except the one at position 0, starts a new span. Spans are half-open, contiguous and
    cover the whole remaining mask.

    Args:
        sample: A sample which may contain ``text_prefix_mask``. When the mask is missing or empty, the sample
            is left untouched.

    Returns:
        The same ``sample`` object, with ``span_tuple`` set to a list of ``(begin, end)`` tuples when a
        non-empty mask is present.
    """
    prefix_mask = sample.get('text_prefix_mask', None)
    if not prefix_mask:
        return sample
    inner_mask = prefix_mask[1:-1]
    # Position 0 always opens the first span, so only later prefixes count as boundaries.
    boundaries = [pos for pos, is_prefix in enumerate(inner_mask) if pos and is_prefix]
    begins = [0] + boundaries
    ends = boundaries + [len(inner_mask)]
    sample['span_tuple'] = list(zip(begins, ends))
    return sample


class RegressionTokenizingDecoder(torch.nn.Linear):
    """A linear layer applied to every position except the first and the last one along dimension 1."""

    def __init__(self, in_features: int, out_features: int = 1, bias: bool = True) -> None:
        """Create the linear layer.

        Args:
            in_features: Size of each input feature vector.
            out_features: Size of each output feature vector.
            bias: ``True`` to learn an additive bias. The default used to be the ``...`` placeholder, which
                enabled the bias only because ``Ellipsis`` is truthy; ``True`` states that default explicitly.
        """
        super().__init__(in_features, out_features, bias)

    # noinspection PyMethodOverriding
    def forward(self, input: Tensor, **kwargs) -> Tensor:
        """Apply the linear layer to the inner positions of ``input``.

        Args:
            input: A 3-dimensional tensor. Its first and last positions along dimension 1 are dropped,
                presumably the special tokens added by the transformer tokenizer (TODO: confirm).
            **kwargs: Not used.

        Returns:
            The projected tensor, with its last dimension squeezed out when it has size 1.
        """
        inner = input[:, 1:-1, :]
        return super().forward(inner).squeeze_(-1)


class RegressionTokenization(Task):
    """Tokenization task which scores each subtoken with a single logit.

    The decoder emits one logit per subtoken. Training uses binary cross entropy with logits against
    ``text_prefix_mask`` (without its first and last positions), and decoding thresholds the logits at ``0`` before
    handing them to :meth:`BinaryChunkingF1.decode_spans`.
    """

    def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
                 dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=True, lr=1e-3, separate_optimizer=False, delimiter=None,
                 max_seq_len=None, sent_delimiter=None) -> None:
        """
        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            delimiter: Delimiter used to split a line in the corpus.
            max_seq_len: Forwarded to the dataset as ``max_seq_len``.
            sent_delimiter: Forwarded to the dataset as ``sent_delimiter``.
        """
        super().__init__(**merge_locals_kwargs(locals()))

    def build_criterion(self, **kwargs):
        """Build a mean-reduced binary cross entropy loss that operates on logits."""
        return torch.nn.BCEWithLogitsLoss(reduction='mean')

    def build_metric(self, **kwargs):
        """Build a fresh :class:`BinaryChunkingF1` metric."""
        return BinaryChunkingF1()

    # noinspection PyMethodOverriding
    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build the decoder, a linear layer mapping ``encoder_size`` features to one logit."""
        return RegressionTokenizingDecoder(encoder_size)

    def predict(self, data: Union[str, List[str]], batch_size: int = None, **kwargs):
        """Not implemented for this task; always returns ``None``."""
        pass

    def build_dataloader(self,
                         data,
                         transform: TransformList = None,
                         training=False,
                         device=None,
                         logger: logging.Logger = None,
                         tokenizer: PreTrainedTokenizer = None,
                         **kwargs) -> DataLoader:
        """Build a dataloader over a :class:`TextTokenizingDataset`.

        Args:
            data: The data to load; samples get an index generated when it is a ``list``.
            transform: Not used, this task installs its own transforms.
            training: ``True`` to shuffle batches.
            device: The device batches are moved to.
            logger: Not used.
            tokenizer: The transformer tokenizer; required.
            **kwargs: Not used.

        Returns:
            A :class:`PadSequenceDataLoader` batched by ``self.sampler_builder``.
        """
        assert tokenizer
        # ``delimiter`` splits a line into tokens while ``sent_delimiter`` marks where sentences may be split, so
        # each one is forwarded from its own config entry (``sent_delimiter`` used to be passed for both).
        dataset = TextTokenizingDataset(data, cache=True, delimiter=self.config.delimiter,
                                        generate_idx=isinstance(data, list),
                                        max_seq_len=self.config.max_seq_len,
                                        sent_delimiter=self.config.sent_delimiter,
                                        transform=[
                                            TransformerSequenceTokenizer(tokenizer,
                                                                         'text',
                                                                         ret_prefix_mask=True,
                                                                         ret_subtokens=True,
                                                                         ),
                                            # delta=-2 leaves out two positions, matching the ``[1:-1]`` slices
                                            # applied to the prefix mask and the hidden states.
                                            FieldLength('text_input_ids', 'text_input_ids_length', delta=-2),
                                            generate_token_span_tuple])
        return PadSequenceDataLoader(
            batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset, 'text_input_ids'),
                                                     shuffle=training),
            device=device,
            dataset=dataset)

    def decode_output(self,
                      output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      batch: Dict[str, Any], **kwargs) -> List[Tuple[int, int]]:
        """Turn logits into spans; a logit above ``0`` (probability above 0.5) counts as a positive prediction."""
        spans = BinaryChunkingF1.decode_spans(output > 0, batch['text_input_ids_length'])
        return spans

    def update_metrics(self, batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: List[Tuple[int, int]], metric: BinaryChunkingF1):
        """Update ``metric`` with the predicted spans against the gold ``span_tuple`` of the batch."""
        metric.update(prediction, batch['span_tuple'])

    def compute_loss(self, batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion):
        """Compute the loss over the positions selected by the length mask of the batch.

        The gold labels are ``text_prefix_mask`` without its first and last columns, cast to float.
        """
        mask = hanlp.utils.torch_util.lengths_to_mask(batch['text_input_ids_length'])
        return criterion(output[mask], batch['text_prefix_mask'][:, 1:-1][mask].to(torch.float))


================================================
FILE: hanlp/components/mtl/tasks/tok/tag_tok.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-11 16:35
import logging
from typing import Dict, Any, Union, Iterable, List, Set

import torch
from torch.utils.data import DataLoader

from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict, TransformList
from hanlp.components.mtl.tasks import Task
from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer
from hanlp.layers.crf.crf import CRF
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp_common.util import merge_locals_kwargs
from hanlp_trie import DictInterface, TrieDict


class LinearCRFDecoder(torch.nn.Module):
    """A linear classifier over hidden states with an optional CRF layer attached as ``self.crf``."""

    def __init__(self,
                 hidden_size,
                 num_labels,
                 crf=False) -> None:
        """
        Args:
            hidden_size: Size of the incoming hidden states.
            num_labels: Number of labels to score.
            crf: ``True`` to create a :class:`CRF` over ``num_labels`` labels, otherwise ``self.crf`` is ``None``.
        """
        super().__init__()
        self.classifier = torch.nn.Linear(hidden_size, num_labels)
        if crf:
            self.crf = CRF(num_labels)
        else:
            self.crf = None

    def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
        """Score every position except the first and the last one along dim 1.

        ``batch`` and ``mask`` are accepted but not used here, and the CRF is not applied in this method.
        """
        inner_states = contextualized_embeddings[:, 1:-1, :]
        return self.classifier(inner_states)


class TaggingTokenization(Task, TransformerTaggingTokenizer):

    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=1e-3, separate_optimizer=False,
                 cls_is_bos=True,
                 sep_is_eos=True,
                 delimiter=None,
                 max_seq_len=None, sent_delimiter=None, char_level=False, hard_constraint=False,
                 transform=None,
                 tagging_scheme='BMES',
                 crf=False,
                 token_key='token',
                 dict_force: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
                 dict_combine: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
                 **kwargs) -> None:
        """Tokenization formulated as tagging instead of chunking.
        Being usually the first task, it has to produce batches of tokens which contain both [CLS] and [SEP] because
        later tasks might need them.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            delimiter: Delimiter used to split a line in the corpus.
            max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
            sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
                be split here.
            char_level: Whether the sequence length is measured at char level.
            hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
                in a sentence, it will be split at a token anyway.
            transform: An optional transform to be applied to samples. Usually a character normalization transform is
                passed in.
            tagging_scheme: Either ``BMES`` or ``BI``.
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
            token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
            dict_force: A custom dictionary assigned through the ``dict_force`` property, which wraps plain
                dicts/sets into a :class:`TrieDict`. It is later handed to the tokenizer transform.
            dict_combine: A custom dictionary assigned through the ``dict_combine`` property of
                :class:`TransformerTaggingTokenizer`.
            **kwargs: Not used.
        """
        # The dictionaries are kept out of the merged config here and installed through their property setters below.
        super().__init__(**merge_locals_kwargs(locals(), kwargs, excludes=(
            'self', 'kwargs', '__class__', 'dict_force', 'dict_combine')))  # avoid to config
        self.transform = transform
        self.vocabs = VocabDict()
        self.dict_force = dict_force
        self.dict_combine = dict_combine

    def build_dataloader(self, data, transform: TransformList = None, training=False, device=None,
                         logger: logging.Logger = None, cache=False, gradient_accumulation=1, **kwargs) -> DataLoader:
        """Build a dataloader whose transform pipeline stops right after the transformer tokenizer.

        Args:
            data: The data to load.
            transform: The shared transform list; it must contain a :class:`TransformerSequenceTokenizer`.
            training: ``True`` to shuffle batches.
            device: The device batches are moved to.
            logger: Logger passed on when vocabs are built.
            cache: Forwarded to ``build_dataset``.
            gradient_accumulation: Forwarded to the sampler builder.
            **kwargs: Not used.

        Returns:
            A :class:`PadSequenceDataLoader`.
        """
        dataset_keys = ('delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint')
        dataset_kwargs = {key: self.config[key] for key in dataset_keys if key in self.config}
        # Transforms placed after the transformer tokenizer are not needed by this task.
        cut = transform.index_by_type(TransformerSequenceTokenizer)
        assert cut is not None
        pipeline = transform[:cut + 1]
        if self.transform:
            pipeline.insert(0, self.transform)
        pipeline.append(self.last_transform())
        dataset = self.build_dataset(data, cache=cache, transform=pipeline, **dataset_kwargs)
        dataset.purge_cache()
        if self.vocabs.mutable:
            self.build_vocabs(dataset, logger)
        lens = self.compute_lens(data, dataset)
        batch_sampler = self.sampler_builder.build(lens, shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler, device=device, dataset=dataset)

    def compute_loss(self,
                     batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                     criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Delegate to :meth:`TransformerTaggingTokenizer.compute_loss` with the gold ``tag_id`` and ``mask``."""
        gold, mask = batch['tag_id'], batch['mask']
        return TransformerTaggingTokenizer.compute_loss(self, criterion, output, gold, mask)

    def decode_output(self, output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor, batch: Dict[str, Any], decoder, **kwargs) -> Union[Dict[str, Any], Any]:
        """Delegate to :meth:`TransformerTaggingTokenizer.decode_output`; extra keyword arguments are dropped."""
        return TransformerTaggingTokenizer.decode_output(self, output, mask, batch, decoder)

    def update_metrics(self, batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
        """Delegate to :meth:`TransformerTaggingTokenizer.update_metrics`, passing ``None`` as its fourth argument."""
        gold = batch['tag_id']
        TransformerTaggingTokenizer.update_metrics(self, metric, output, gold, None, batch, prediction)

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build a :class:`LinearCRFDecoder` sized by the ``tag`` vocab, with CRF toggled by the config."""
        num_tags = len(self.vocabs['tag'])
        return LinearCRFDecoder(encoder_size, num_tags, self.config.crf)

    def build_metric(self, **kwargs):
        """Delegate to :meth:`TransformerTaggingTokenizer.build_metric`."""
        return TransformerTaggingTokenizer.build_metric(self)

    def build_criterion(self, model=None, **kwargs):
        """Delegate to :meth:`TransformerTaggingTokenizer.build_criterion` with mean reduction."""
        return TransformerTaggingTokenizer.build_criterion(self, model=model, reduction='mean')

    def input_is_flat(self, data) -> bool:
        """Delegate to :meth:`TransformerTaggingTokenizer.input_is_flat`."""
        return TransformerTaggingTokenizer.input_is_flat(self, data)

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> Union[List, Dict]:
        """Delegate to :meth:`TransformerTaggingTokenizer.prediction_to_human` with ``rebuild_span=True``."""
        return TransformerTaggingTokenizer.prediction_to_human(self, prediction, None, batch, rebuild_span=True)

    def build_tokenizer(self, tokenizer: TransformerSequenceTokenizer):
        """Clone ``tokenizer`` with the settings this task depends on.

        The underlying tokenizer, keys, maximum length and truncation policy are copied from ``tokenizer``; the
        remaining options are fixed here because the tokenization transform needs very specific settings.
        """
        return TransformerSequenceTokenizer(
            tokenizer.tokenizer,
            tokenizer.input_key,
            tokenizer.output_key,
            tokenizer.max_seq_length,
            tokenizer.truncate_long_sequences,
            ret_subtokens=True,
            ret_subtokens_group=True,
            ret_token_span=True,
            cls_is_bos=True,
            sep_is_eos=True,
            use_fast=tokenizer.tokenizer.is_fast,
            dict_force=self.dict_force,
            strip_cls_sep=False,
        )

    def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False):
        """Wrap each input into a single-key sample dict keyed by the configured ``token_key``."""
        samples = []
        for sent in inputs:
            samples.append({self.config.token_key: sent})
        return samples

    @property
    def dict_force(self) -> DictInterface:
        """The ``dict_force`` dictionary as exposed by :class:`TransformerTaggingTokenizer`."""
        return TransformerTaggingTokenizer.dict_force.fget(self)

    @dict_force.setter
    def dict_force(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
        # Anything that is neither ``None`` nor already a DictInterface gets wrapped into a trie.
        needs_wrapping = not (dictionary is None or isinstance(dictionary, DictInterface))
        if needs_wrapping:
            dictionary = TrieDict(dictionary)
        self.config.dict_force = dictionary

    @property
    def dict_combine(self) -> DictInterface:
        """The ``dict_combine`` dictionary as exposed by :class:`TransformerTaggingTokenizer`."""
        return TransformerTaggingTokenizer.dict_combine.fget(self)

    @dict_combine.setter
    def dict_combine(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
        # noinspection PyArgumentList
        TransformerTaggingTokenizer.dict_combine.fset(self, dictionary)

    def transform_batch(self, batch: Dict[str, Any], results: Dict[str, Any] = None, cls_is_bos=False,
                        sep_is_eos=False) -> Dict[str, Any]:
        """
        This method is overridden to honor the zero indexed token used in custom dict. Although for a tokenizer,
        cls_is_bos = sep_is_eos = True, its tokens don't contain [CLS] or [SEP]. This behaviour is adopted from the
        early versions and it is better kept to avoid migration efforts.


        Args:
            batch: A batch of samples.
            results: Predicted results from other tasks which might be useful for this task to utilize. Say a dep task
                uses both token and pos as features, then it will need both tok and pos results to make a batch.
            cls_is_bos: First token in this batch is BOS.
            sep_is_eos: Last token in this batch is EOS.

        Returns:
            The batch, unchanged.

        """
        return batch


================================================
FILE: hanlp/components/mtl/tasks/ud.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-17 21:54
import logging
from typing import Dict, Any, List, Union, Iterable, Callable

import torch
from torch.utils.data import DataLoader

from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp_common.document import Document
from hanlp.common.transform import VocabDict, PunctuationMask
from hanlp.components.mtl.tasks import Task
from hanlp_common.conll import CoNLLUWord
from hanlp.components.parsers.ud.ud_model import UniversalDependenciesDecoder
from hanlp.components.parsers.ud.ud_parser import UniversalDependenciesParser
from hanlp.components.parsers.ud.util import generate_lemma_rule, append_bos
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs


class UniversalDependenciesParsing(Task, UniversalDependenciesParser):

    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=None,
                 separate_optimizer=False,
                 cls_is_bos=True,
                 sep_is_eos=False,
                 n_mlp_arc=768,
                 n_mlp_rel=256,
                 mlp_dropout=.33,
                 tree=False,
                 proj=False,
                 punct=False,
                 max_seq_len=None,
                 **kwargs) -> None:
        r"""Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing) implementation
        of "75 Languages, 1 Model: Parsing Universal Dependencies Universally" (:cite:`kondratyuk-straka-2019-75`).

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            n_mlp_arc: Number of features for arc representation.
            n_mlp_rel: Number of features for rel representation.
            mlp_dropout: Dropout applied to MLPs.
            tree: ``True`` to enforce tree constraint.
            proj: ``True`` for projective parsing.
            punct: ``True`` to include punctuations in evaluation.
            max_seq_len: Prune samples longer than this length. Useful for reducing GPU consumption.
            **kwargs: Not used.
        """
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()

    def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
                         logger: logging.Logger = None, cache=False, gradient_accumulation=1, **kwargs) -> DataLoader:
        """Build a dataloader for UD parsing.

        Args:
            data: The data to load. Punctuation masking and length pruning only apply when it is a ``str``.
            transform: The shared transform, appended after this task's own transforms.
            training: ``True`` to shuffle batches.
            device: The device batches are moved to.
            logger: Logger passed on when vocabs are built or samples are pruned.
            cache: Not used.
            gradient_accumulation: Forwarded to the sampler builder.
            **kwargs: Not used.

        Returns:
            A :class:`PadSequenceDataLoader` which pads ``arc`` with ``0``.
        """
        _transform = [generate_lemma_rule, append_bos, self.vocabs, transform]
        if isinstance(data, str) and not self.config.punct:
            _transform.append(PunctuationMask('token', 'punct_mask'))
        dataset = UniversalDependenciesParser.build_dataset(self, data, _transform)
        dataset.purge_cache()
        if self.vocabs.mutable:
            UniversalDependenciesParser.build_vocabs(self, dataset, logger, transformer=True)
        max_seq_len = self.config.get('max_seq_len', None)
        if max_seq_len and isinstance(data, str):
            dataset.prune(lambda x: len(x['token_input_ids']) > max_seq_len, logger)
        return PadSequenceDataLoader(
            batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset),
                                                     shuffle=training, gradient_accumulation=gradient_accumulation),
            device=device,
            dataset=dataset,
            pad={'arc': 0})

    def compute_loss(self, batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
            Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Return the decoder's own loss divided by the number of sub-tasks; ``criterion`` is not used."""
        return output[0]['loss'] / 4  # we have 4 tasks

    def decode_output(self, output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor, batch: Dict[str, Any], decoder: torch.nn.Module, **kwargs) -> Union[
        Dict[str, Any], Any]:
        """Delegate to :meth:`UniversalDependenciesParser.decode_output`.

        ``output`` is unpacked positionally, so it is expected to be the pair returned by :meth:`feed_batch`; the
        ``mask`` argument itself is ignored.
        """
        return UniversalDependenciesParser.decode_output(self, *output, batch)

    def update_metrics(self, batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
        """Delegate to :meth:`UniversalDependenciesParser.update_metrics`; ``prediction`` is not used."""
        UniversalDependenciesParser.update_metrics(self, metric, batch, *output)

    # noinspection PyMethodOverriding
    def build_model(self,
                    encoder_size,
                    n_mlp_arc,
                    n_mlp_rel,
                    mlp_dropout,
                    training=True,
                    **kwargs) -> torch.nn.Module:
        """Build a :class:`UniversalDependenciesDecoder` sized by the ``rel``, ``lemma``, ``pos`` and ``feat`` vocabs."""
        return UniversalDependenciesDecoder(
            encoder_size,
            n_mlp_arc,
            n_mlp_rel,
            mlp_dropout,
            len(self.vocabs.rel),
            len(self.vocabs.lemma),
            len(self.vocabs.pos),
            len(self.vocabs.feat),
            0,
            0
        )

    def build_metric(self, **kwargs):
        """Delegate to :meth:`UniversalDependenciesParser.build_metric`."""
        return UniversalDependenciesParser.build_metric(self)

    def input_is_flat(self, data) -> bool:
        """Delegate to :meth:`UniversalDependenciesParser.input_is_flat`."""
        return UniversalDependenciesParser.input_is_flat(self, data)

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
        """Lazily yield the results of :meth:`UniversalDependenciesParser.prediction_to_human` (a generator)."""
        yield from UniversalDependenciesParser.prediction_to_human(self, prediction, batch)

    def feed_batch(self, h: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask: torch.BoolTensor,
                   decoder: torch.nn.Module):
        """Run the decoder and return its output together with a mask whose first column is zeroed.

        The incoming ``mask`` is discarded and recomputed from ``batch``.
        """
        mask = self.compute_mask(batch)
        output_dict = decoder(h, batch, mask)
        if decoder.training:
            # Edit a copy in training mode so the mask handed to the decoder is left untouched
            # (presumably because it takes part in the loss graph -- confirm).
            mask = mask.clone()
        mask[:, 0] = 0
        return output_dict, mask

    def finalize_document(self, doc: Document, task_name: str):
        """Promote the per-word UD annotations of ``doc[task_name]`` to top-level keys of ``doc``.

        Lemmas, universal PoS tags, features and ``(head, deprel)`` pairs are stored under ``lem``, ``pos``, ``fea``
        and ``dep`` respectively, but only for keys which are not in ``doc`` yet. When all four were promoted the
        original ``task_name`` entry is removed.

        Args:
            doc: The document to finalize, modified in place.
            task_name: The key under which this task stored its sentences of :class:`CoNLLUWord`.
        """
        lem = []
        pos = []
        feat = []
        dep = []
        for sent in doc[task_name]:
            sent: List[CoNLLUWord] = sent
            lem.append([x.lemma for x in sent])
            pos.append([x.upos for x in sent])
            feat.append([x.feats for x in sent])
            dep.append([(x.head, x.deprel) for x in sent])
        # The guard must test the very key that gets written: features live under ``fea``, so checking ``feat``
        # would overwrite an existing ``fea`` annotation.
        promotions = {'lem': lem, 'pos': pos, 'fea': feat, 'dep': dep}
        promoted = 0
        for key, values in promotions.items():
            if key not in doc:
                doc[key] = values
                promoted += 1
        if promoted == len(promotions):
            doc.pop(task_name)


================================================
FILE: hanlp/components/ner/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-21 17:22

================================================
FILE: hanlp/components/ner/biaffine_ner/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-21 18:41

================================================
FILE: hanlp/components/ner/biaffine_ner/biaffine_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-09 18:13
import logging
from typing import Union, List, Callable, Dict, Any

from hanlp_common.constant import IDX
from hanlp.common.structure import History
from hanlp.components.ner.biaffine_ner.biaffine_ner_model import BiaffineNamedEntityRecognitionModel
from hanlp.datasets.ner.loaders.json_ner import JsonNERDataset, unpack_ner
from hanlp.layers.transformers.utils import build_optimizer_scheduler_with_transformer
import torch
from torch.utils.data import DataLoader
from hanlp.common.dataset import PadSequenceDataLoader
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import FieldLength, TransformList
from hanlp.common.vocab import Vocab
from hanlp.layers.embeddings.embedding import Embedding
from hanlp.metrics.f1 import F1
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs, reorder


class BiaffineNamedEntityRecognizer(TorchComponent):

    def __init__(self, **kwargs) -> None:
        """An implementation of Named Entity Recognition as Dependency Parsing (:cite:`yu-etal-2020-named`). It treats
        every possible span as a candidate of entity and predicts its entity label. Non-entity spans are assigned NULL
        label to be excluded. The label prediction is done with a biaffine layer (:cite:`dozat:17a`). As it makes no
        assumption about the spans, it naturally supports flat NER and nested NER.

        Args:
            **kwargs: Predefined config.
        """
        super().__init__(**kwargs)
        # Redeclared only to narrow the type for IDEs; the model stays unset until it is built or loaded.
        self.model: BiaffineNamedEntityRecognitionModel = None

    def build_optimizer(self,
                        trn,
                        epochs,
                        lr,
                        adam_epsilon,
                        weight_decay,
                        warmup_steps,
                        transformer_lr,
                        **kwargs):
        """Build the optimizer and its learning rate scheduler.

        Without a transformer, Adam is paired with a ``ReduceLROnPlateau`` scheduler in ``max`` mode. Otherwise both
        are created by ``build_optimizer_scheduler_with_transformer`` for the number of update steps implied by
        ``trn``, ``epochs`` and the configured gradient accumulation.

        Returns:
            A tuple of ``(optimizer, scheduler)``.
        """
        # noinspection PyProtectedMember
        if not self.use_transformer:
            optimizer = torch.optim.Adam(self.model.parameters(), self.config.lr)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer=optimizer,
                mode='max',
                factor=0.5,
                patience=2,
                verbose=True,
            )
            return optimizer, scheduler
        total_batches = len(trn) * epochs
        num_training_steps = total_batches // self.config.get('gradient_accumulation', 1)
        optimizer, scheduler = build_optimizer_scheduler_with_transformer(self.model,
                                                                          self._get_transformer(),
                                                                          lr, transformer_lr,
                                                                          num_training_steps, warmup_steps,
                                                                          weight_decay, adam_epsilon)
        return optimizer, scheduler

    @property
    def use_transformer(self):
        return 'token' not in self.vocabs

    def _get_transformer(self):
        return getattr(self.model_.embed, 'transformer', None)

    def build_criterion(self, **kwargs):
        pass

    # noinspection PyProtectedMember
    def build_metric(self, **kwargs) -> F1:
        """Create a fresh :class:`F1` metric; keyword arguments are ignored."""
        metric = F1()
        return metric

    def execute_training_loop(self,
                              trn: DataLoader,
                              dev: DataLoader,
                              epochs,
                              criterion,
                              optimizer,
                              metric,
                              save_dir,
                              logger: logging.Logger,
                              devices,
                              gradient_accumulation=1,
                              **kwargs):
        """Train for ``epochs`` epochs, saving weights whenever the score improves.

        Args:
            trn: Training dataloader.
            dev: Dev dataloader; evaluation is skipped when it is falsy.
            epochs: Number of epochs.
            criterion: Passed through to fitting and evaluation.
            optimizer: The ``(optimizer, scheduler)`` pair produced by ``build_optimizer``.
            metric: The metric, shared by training and evaluation.
            save_dir: Where weights are saved.
            logger: Logger for progress.
            devices: Not used here.
            gradient_accumulation: Number of mini-batches per optimizer update.
            **kwargs: Not used.

        Returns:
            The best score observed.
        """
        best_metric = -1
        optimizer, scheduler = optimizer
        history = History()
        timer = CountdownTimer(epochs)
        # Width of the "current/total" progress ratio, i.e. two numbers plus the slash.
        ratio_width = 2 * len(str(len(trn))) + 1
        for epoch in range(1, epochs + 1):
            logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
            # The scheduler is stepped per update only with a transformer; otherwise it is stepped per epoch below.
            if self._get_transformer():
                per_step_scheduler = scheduler
            else:
                per_step_scheduler = None
            self.fit_dataloader(trn, criterion, optimizer, metric, logger, history=history,
                                gradient_accumulation=gradient_accumulation,
                                linear_scheduler=per_step_scheduler)
            if dev:
                self.evaluate_dataloader(dev, criterion, metric, logger, ratio_width=ratio_width)
            report = f'{timer.elapsed_human}/{timer.total_time_human}'
            dev_score = metric.score
            if not self._get_transformer():
                scheduler.step(dev_score)
            if dev_score > best_metric:
                self.save_weights(save_dir)
                best_metric = dev_score
                report += ' [red]saved[/red]'
            timer.log(report, ratio_percentage=False, newline=True, ratio=False)
        return best_metric

    def fit_dataloader(self,
                       trn: DataLoader,
                       criterion,
                       optimizer,
                       metric,
                       logger: logging.Logger,
                       linear_scheduler=None,
                       history: History = None,
                       gradient_accumulation=1,
                       **kwargs):
        """Train the model for one pass over ``trn``.

        Args:
            trn: Training dataloader.
            criterion: Not used; the loss is read from the output of ``feed_batch``.
            optimizer: The optimizer to step.
            metric: The metric, reset at the beginning and updated with every batch.
            logger: Logger for progress.
            linear_scheduler: An optional scheduler stepped after every optimizer update.
            history: Tracks mini-batches across epochs and decides when to update; required.
            gradient_accumulation: Number of mini-batches whose gradients are accumulated per optimizer update.
            **kwargs: Not used.

        Returns:
            The accumulated loss divided by the number of steps the timer was created with.
        """
        self.model.train()
        timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
        total_loss = 0
        self.reset_metrics(metric)
        for batch in trn:
            output_dict = self.feed_batch(batch)
            self.update_metrics(batch, output_dict, metric)
            loss = output_dict['loss']
            if gradient_accumulation and gradient_accumulation > 1:
                # Scale so the accumulated gradient is an average over the mini-batches.
                loss /= gradient_accumulation
            loss.backward()
            total_loss += loss.item()
            # ``history.step`` is expected to return True once enough mini-batches have been accumulated.
            if history.step(gradient_accumulation):
                if self.config.grad_norm:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm)
                optimizer.step()
                # Gradients are cleared only after an update. Clearing them before every mini-batch would throw
                # away all but the last mini-batch of each accumulation window.
                optimizer.zero_grad()
                if linear_scheduler:
                    linear_scheduler.step()
                timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
                          logger=logger)
            del loss
        return total_loss / timer.total

    # noinspection PyMethodOverriding
    @torch.no_grad()
    def evaluate_dataloader(self,
                            data: DataLoader,
                            criterion: Callable,
                            metric,
                            logger,
                            ratio_width=None,
                            output=False,
                            **kwargs):
        """Evaluate the model on ``data`` and optionally dump predictions to a file.

        Args:
            data: The dataloader to evaluate on.
            criterion: Not used; the loss is read from the output of ``feed_batch``.
            metric: The metric, reset at the beginning and updated with every batch.
            logger: Logger for progress.
            ratio_width: Forwarded to the progress timer.
            output: A path to write tokens, predicted and gold entities to, or a falsy value to skip the dump.
            **kwargs: Not used.

        Returns:
            A tuple of the accumulated loss divided by the number of batches, and ``metric``.
        """
        self.model.eval()
        self.reset_metrics(metric)
        timer = CountdownTimer(len(data))
        total_loss = 0
        # Tokens may be non-ASCII, so the dump is written as UTF-8 rather than in the platform default encoding.
        fp = open(output, 'w', encoding='utf-8') if output else None
        try:
            for batch in data:
                output_dict = self.feed_batch(batch)
                if fp:
                    for sent, pred, gold in zip(batch['token'], output_dict['prediction'], batch['ner']):
                        # Spans are (begin, end, label) with an inclusive end offset.
                        fp.write('Tokens\t' + ' '.join(sent) + '\n')
                        fp.write('Pred\t' + '\t'.join(
                            ['[' + ' '.join(sent[x:y + 1]) + f']/{label}' for x, y, label in pred]) + '\n')
                        fp.write('Gold\t' + '\t'.join(
                            ['[' + ' '.join(sent[x:y + 1]) + f']/{label}' for x, y, label in gold]) + '\n')
                        fp.write('\n')
                self.update_metrics(batch, output_dict, metric)
                loss = output_dict['loss']
                total_loss += loss.item()
                timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
                          logger=logger,
                          ratio_width=ratio_width)
                del loss
        finally:
            # Close the dump even if a batch fails so the file handle does not leak.
            if fp:
                fp.close()
        return total_loss / timer.total, metric

    def build_model(self,
                    training=True,
                    **kwargs) -> torch.nn.Module:
        """Build the biaffine NER model from the config, its embedding module and the ``label`` vocab size."""
        # noinspection PyTypeChecker
        config = self.config
        embed = config.embed.module(vocabs=self.vocabs)
        context_layer = config.context_layer
        num_labels = len(self.vocabs.label)
        return BiaffineNamedEntityRecognitionModel(config, embed, context_layer, num_labels)

    # noinspection PyMethodOverriding
    def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger = None, vocabs=None,
                         sampler_builder=None,
                         gradient_accumulation=1,
                         **kwargs) -> DataLoader:
        """Build a dataloader for NER samples.

        Args:
            data: The data to load.
            batch_size: Not used here; batching is decided by ``sampler_builder``.
            shuffle: Forwarded to the sampler builder.
            device: The device batches are moved to.
            logger: Logger passed on when vocabs are built.
            vocabs: The vocabs to use, defaults to ``self.vocabs``.
            sampler_builder: Builds the batch sampler from sample lengths; no batch sampler is used when omitted.
            gradient_accumulation: Forwarded to the sampler builder.
            **kwargs: Not used.

        Returns:
            A :class:`PadSequenceDataLoader`.
        """
        if vocabs is None:
            vocabs = self.vocabs
        transform = TransformList(unpack_ner, FieldLength('token'))
        if isinstance(self.config.embed, Embedding):
            transform.append(self.config.embed.transform(vocabs=vocabs))
        # NOTE(review): this appends ``self.vocabs`` rather than the ``vocabs`` argument -- confirm it is intended.
        transform.append(self.vocabs)
        dataset = self.build_dataset(data, vocabs, transform)
        if vocabs.mutable:
            self.build_vocabs(dataset, logger, vocabs)
        # The sampler needs one integer length per sample: the number of tokens when a token vocab is used,
        # otherwise the number of subtoken ids.
        if 'token' in vocabs:
            lens = [len(x['token']) for x in dataset]
        else:
            lens = [len(x['token_input_ids']) for x in dataset]
        if sampler_builder:
            sampler = sampler_builder.build(lens, shuffle, gradient_accumulation)
        else:
            sampler = None
        return PadSequenceDataLoader(batch_sampler=sampler,
                                     device=device,
                                     dataset=dataset)

    def build_dataset(self, data, vocabs, transform):
        """Create a ``JsonNERDataset`` and attach the vocabularies as its last transform.

        Args:
            data: The samples or a path to them.
            vocabs: Vocabularies appended to the dataset as a transform.
            transform: The transform (pipeline) applied to each sample.

        Returns:
            The dataset.
        """
        config = self.config
        dataset = JsonNERDataset(data,
                                 transform=transform,
                                 doc_level_offset=config.get('doc_level_offset', True),
                                 tagset=config.get('tagset', None))
        dataset.append_transform(vocabs)
        loaded_from_path = isinstance(data, str)
        if loaded_from_path:
            dataset.purge_cache()  # Enable cache
        return dataset

    def predict(self, data: Union[List[str], List[List[str]]], batch_size: int = None, ret_tokens=True, **kwargs):
        """Recognize named entities in one tokenized sentence or a list of them.

        Args:
            data: A list of tokens (one sentence) or a list of such lists.
            batch_size: Forwarded to :meth:`build_dataloader`.
            ret_tokens: Forwarded to :meth:`prediction_to_result`; controls the shape of each returned entity.
            **kwargs: Not used.

        Returns:
            The entities of the single sentence if ``data`` is flat, otherwise one list of entities per
            sentence. An empty list is returned for empty input.
        """
        if not data:
            return []
        is_single_sentence = self.input_is_flat(data)
        sentences = [data] if is_single_sentence else data
        samples = [{'token': tokens} for tokens in sentences]
        dataloader = self.build_dataloader(samples, batch_size, False, self.device)
        results, sample_ids = [], []
        for batch in dataloader:
            outputs = self.feed_batch(batch)
            self.prediction_to_result(batch['token'], outputs['prediction'], results, ret_tokens)
            sample_ids.extend(batch[IDX])
        # Results are put back into place using the sample indices carried by each batch.
        results = reorder(results, sample_ids)
        return results[0] if is_single_sentence else results

    @staticmethod
    def prediction_to_result(token, prediction, predictions: List, ret_tokens: Union[bool, str]):
        for tokens, ner in zip(token, prediction):
            prediction_per_sent = []
            for i, (b, e, l) in enumerate(ner):
                if ret_tokens is not None:
                    entity = tokens[b: e + 1]
                    if isinstance(ret_tokens, str):
                        entity = ret_tokens.join(entity)
                    prediction_per_sent.append((entity, l, b, e + 1))
                else:
                    prediction_per_sent.append((b, e + 1, l))
            predictions.append(prediction_per_sent)

    @staticmethod
    def input_is_flat(data):
        return isinstance(data[0], str)

    # noinspection PyMethodOverriding
    def fit(self,
            trn_data,
            dev_data,
            save_dir,
            embed: Embedding,
            context_layer,
            sampler='sorting',
            n_buckets=32,
            batch_size=50,
            lexical_dropout=0.5,
            ffnn_size=150,
            is_flat_ner=True,
            doc_level_offset=True,
            lr=1e-3,
            transformer_lr=1e-5,
            adam_epsilon=1e-6,
            weight_decay=0.01,
            warmup_steps=0.1,
            grad_norm=5.0,
            epochs=50,
            loss_reduction='sum',
            gradient_accumulation=1,
            ret_tokens=True,
            tagset=None,
            sampler_builder=None,
            devices=None,
            logger=None,
            seed=None,
            **kwargs
            ):
        """Fit this component to a training set.

        All arguments are collected through ``locals()`` and forwarded to the parent ``fit``.

        Args:
            trn_data: Path to training set.
            dev_data: Path to dev set.
            save_dir: The directory to save trained component.
            embed: Embeddings to use.
            context_layer: A contextualization layer (transformer or RNN).
            sampler: Sampler to use.
            n_buckets: Number of buckets to use in KMeans sampler.
            batch_size: The number of samples in a batch.
            lexical_dropout: Dropout rate read by the model as ``config.lexical_dropout``.
            ffnn_size: Feedforward size for MLPs extracting the head/tail representations.
            is_flat_ner: ``True`` for flat NER, otherwise nested NER.
            doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
            lr: Learning rate for decoder.
            transformer_lr: Learning rate for encoder.
            adam_epsilon: The epsilon to use in Adam.
            weight_decay: The weight decay to use.
            warmup_steps: The number of warmup steps.
            grad_norm: Gradient norm for clipping.
            epochs: The number of epochs to train.
            loss_reduction: The loss reduction used in aggregating losses.
            gradient_accumulation: Number of mini-batches per update step.
            ret_tokens: A delimiter between tokens in entities so that the surface form of an entity can be rebuilt.
            tagset: Optional tagset to prune entities outside of this tagset from datasets.
            sampler_builder: The builder to build sampler, which will override batch_size.
            devices: Devices this component will live on.
            logger: Any :class:`logging.Logger` instance.
            seed: Random seed to reproduce this training.
            **kwargs: Merged with the named arguments and forwarded to the parent ``fit``.

        Returns:
            Whatever the parent ``fit`` returns (documented upstream as the best metrics).
        """
        # ``locals()`` is captured here, so no additional local variable may be introduced above this line.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def build_vocabs(self, dataset, logger, vocabs, lock=True, label_vocab_name='label', **kwargs):
        """Create the label vocabulary and make one full pass over the dataset.

        Args:
            dataset: The dataset to iterate over.
            logger: Logger used for the vocabulary summary.
            vocabs: The vocabulary collection that receives the label vocabulary.
            lock: ``True`` to lock the vocabularies and log their summary afterwards.
            label_vocab_name: Key under which the label vocabulary is stored.
            **kwargs: Not used.
        """
        label_vocab = Vocab(pad_token=None, unk_token=None)
        vocabs[label_vocab_name] = label_vocab
        # Use null to indicate no relationship
        label_vocab.add('<null>')
        progress = CountdownTimer(len(dataset))
        # NOTE(review): presumably iterating the dataset runs its transforms, which populate the still
        # mutable vocabularies; confirm against the dataset/transform implementation.
        for _ in dataset:
            progress.log('Building NER vocab [blink][yellow]...[/yellow][/blink]')
        label_vocab.set_unk_as_safe_unk()
        if not lock:
            return
        vocabs.lock()
        vocabs.summary(logger)

    def reset_metrics(self, metrics):
        """Reset the given metric by calling its ``reset()`` method.

        Args:
            metrics: The metric object to reset.
        """
        metrics.reset()

    def report_metrics(self, loss, metrics):
        """Format the loss (4 decimal places) followed by the metrics for progress logging.

        Args:
            loss: The loss value to report.
            metrics: The metrics object, rendered with its default formatting.

        Returns:
            A string of the form ``loss: <loss> <metrics>``.
        """
        return 'loss: {:.4f} {}'.format(loss, metrics)

    def feed_batch(self, batch) -> Dict[str, Any]:
        """Run the model on a batch and attach the decoded entities to its output.

        Args:
            batch: A batch containing at least the ``token`` field.

        Returns:
            The model output dict with an extra ``prediction`` entry produced by :meth:`get_pred_ner`.
        """
        outputs = self.model(batch)
        tokens = batch['token']
        span_scores = outputs['candidate_ner_scores']
        outputs['prediction'] = self.get_pred_ner(tokens, span_scores)
        return outputs

    def update_metrics(self, batch: dict, prediction: Union[Dict, List], metrics):
        """Feed predicted and gold spans to the metric, one sentence at a time.

        Args:
            batch: A batch whose ``ner`` field holds the gold spans per sentence.
            prediction: Either the per-sentence predictions or an output dict holding them under
                ``prediction``.
            metrics: A callable metric invoked with ``(set(pred), set(gold))`` for every sentence.
        """
        predicted = prediction['prediction'] if isinstance(prediction, dict) else prediction
        gold_batch = batch['ner']
        assert len(predicted) == len(gold_batch)
        for pred_spans, gold_spans in zip(predicted, gold_batch):
            metrics(set(pred_spans), set(gold_spans))

    def get_pred_ner(self, sentences, span_scores):
        """Greedily decode entity spans from the scores of all candidate spans.

        Candidate spans are enumerated sentence by sentence as every ``(start, end)`` with
        ``start <= end``; row ``i`` of ``span_scores`` is taken as the label scores of the ``i``-th
        candidate. Spans whose best label has index ``0`` are dropped, the remaining ones are visited
        per sentence in decreasing score order and kept unless they conflict with an already kept span.

        Args:
            sentences: Batch of token lists.
            span_scores: Tensor of label scores, one row per candidate span.

        Returns:
            For each sentence, a sorted list of ``(start, inclusive_end, label)``.
        """
        flat_only = self.config.is_flat_ner
        num_sentences = len(sentences)
        # Same enumeration order as the rows of ``span_scores``.
        span_index = [(sent_id, start, end)
                      for sent_id, sent in enumerate(sentences)
                      for start in range(len(sent))
                      for end in range(start, len(sent))]

        scored_spans = [[] for _ in range(num_sentences)]
        score_rows = span_scores.tolist()
        best_labels = torch.argmax(span_scores, dim=-1).tolist()
        for row, label_id in enumerate(best_labels):
            if label_id <= 0:
                continue
            sent_id, start, end = span_index[row]
            scored_spans[sent_id].append((start, end, label_id, score_rows[row][label_id]))

        # Highest scoring spans are considered first.
        for ranked in scored_spans:
            ranked.sort(key=lambda span: span[3], reverse=True)

        selected = [[] for _ in range(num_sentences)]
        for sent_id, ranked in enumerate(scored_spans):
            kept = selected[sent_id]
            for ns, ne, label_id, _ in ranked:
                rejected = False
                for ts, te, _ in kept:
                    # for both nested and flat ner no clash is allowed
                    crossing = ns < ts <= ne < te or ts < ns <= te < ne
                    # for flat ner nested mentions are not allowed
                    nested = ns <= ts <= te <= ne or ts <= ns <= ne <= te
                    if crossing or (flat_only and nested):
                        rejected = True
                        break
                if not rejected:
                    kept.append((ns, ne, label_id))

        idx_to_label = self.vocabs['label'].idx_to_token
        return [[(start, end, idx_to_label[label_id]) for start, end, label_id in sorted(set(kept))]
                for kept in selected]


================================================
FILE: hanlp/components/ner/biaffine_ner/biaffine_ner_model.py
================================================
from typing import Dict

import torch
import torch.nn.functional as F
from torch import nn

import hanlp.utils.torch_util
from hanlp.layers.time_distributed import TimeDistributed
from ...parsers.biaffine.biaffine import Biaffine


def initializer_1d(input_tensor, initializer):
    """Apply an initializer to a 1-D tensor by presenting it as a single column.

    Args:
        input_tensor: A tensor with exactly one dimension.
        initializer: A callable taking the ``(n, 1)`` view of the tensor and returning a tensor.

    Returns:
        The initializer's result flattened back to one dimension.
    """
    assert input_tensor.dim() == 1
    as_column = input_tensor.view(-1, 1)
    initialized = initializer(as_column)
    return initialized.view(-1)


class BiaffineNamedEntityRecognitionModel(nn.Module):
    """Embedding, context layer and biaffine decoder chained into one NER model."""

    def __init__(self, config, embed: torch.nn.Module, context_layer: torch.nn.Module, label_space_size):
        """Wire the sub-modules together.

        Args:
            config: Configuration providing ``lexical_dropout``, ``ffnn_size`` and ``loss_reduction``.
            embed: The embedding module; must expose ``get_output_dim()``.
            context_layer: The contextualization module; must expose ``get_output_dim()``.
            label_space_size: Number of labels the decoder scores.
        """
        super().__init__()
        self.config = config
        self.lexical_dropout = float(config.lexical_dropout)
        self.label_space_size = label_space_size

        # Sub-modules are registered in the order embed -> context_layer -> decoder.
        self.word_embedding_dim = embed.get_output_dim()
        self.embed = embed
        self.context_layer = context_layer
        self.decoder = BiaffineNamedEntityRecognitionDecoder(context_layer.get_output_dim(),
                                                             config.ffnn_size,
                                                             label_space_size,
                                                             config.loss_reduction)

    def forward(self,
                batch: Dict[str, torch.Tensor]
                ):
        """Embed, contextualize and decode a batch.

        Args:
            batch: A batch holding ``token_length`` and, when gold spans are available, ``begin_offset``,
                ``end_offset`` and ``label_id``.

        Returns:
            The dict produced by the decoder's ``decode``.
        """
        sent_lengths = batch.get('token_length', None)
        gold_starts = batch.get('begin_offset', None)
        gold_ends = batch.get('end_offset', None)
        gold_labels = batch.get('label_id', None)

        masks = hanlp.utils.torch_util.lengths_to_mask(sent_lengths)
        num_sentences, max_sent_length = masks.size()

        # Dropout is applied to the embeddings before they enter the context layer.
        embedded = F.dropout(self.embed(batch, mask=masks), self.lexical_dropout, self.training)
        contextualized = self.context_layer(embedded, masks)
        return self.decoder.decode(contextualized, gold_starts, gold_ends, gold_labels, masks,
                                   max_sent_length,
                                   num_sentences, sent_lengths)


class BiaffineNamedEntityRecognitionDecoder(nn.Module):
    def __init__(self, hidden_size, ffnn_size, label_space_size, loss_reduction='sum') -> None:
        """An implementation of the biaffine decoder in "Named Entity Recognition as Dependency Parsing"
        (:cite:`yu-etal-2020-named`).

        Args:
            hidden_size: Size of hidden states.
            ffnn_size: Feedforward size for MLPs extracting the head/tail representations.
            label_space_size: Size of tag set.
            loss_reduction: The loss reduction used in aggregating losses.
        """
        super().__init__()
        self.loss_reduction = loss_reduction

        # MLPs
        def new_mlp():
            return TimeDistributed(nn.Linear(hidden_size, ffnn_size))

        self.start_mlp = new_mlp()
        self.end_mlp = new_mlp()
        self.biaffine = Biaffine(ffnn_size, label_space_size)

    def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
        """Decode a batch given already contextualized embeddings.

        Args:
            contextualized_embeddings: Output of the context layer.
            batch: A batch holding ``token_length`` and optionally the gold fields ``begin_offset``,
                ``end_offset`` and ``label_id``.
            mask: Optional token mask; derived from ``token_length`` when omitted.

        Returns:
            The dict produced by :meth:`decode`.
        """
        keys = 'token_length', 'begin_offset', 'end_offset', 'label_id'
        sent_lengths, gold_starts, gold_ends, gold_labels = [batch.get(k, None) for k in keys]
        if mask is None:
            mask = hanlp.utils.torch_util.lengths_to_mask(sent_lengths)
        num_sentences, max_sent_length = mask.size()
        return self.decode(contextualized_embeddings, gold_starts, gold_ends, gold_labels, mask,
                           max_sent_length,
                           num_sentences, sent_lengths)

    def get_dense_span_labels(self, span_starts, span_ends, span_labels, max_sentence_length):
        """Scatter sparse gold spans into a dense ``[num_sentences, max_len, max_len]`` label tensor.

        Entry ``[i, s, e]`` of the result holds the label of the span ``(s, e)`` in sentence ``i`` and ``0``
        where no span is given. Values given several times for the same position are summed by the
        sparse-to-dense conversion.

        Args:
            span_starts: ``[num_sentences, max_spans_num]`` start indices.
            span_ends: ``[num_sentences, max_spans_num]`` end indices.
            span_labels: ``[num_sentences, max_spans_num]`` label ids.
            max_sentence_length: Size of the two span dimensions of the result.

        Returns:
            A dense ``torch.long`` tensor of shape ``[num_sentences, max_sentence_length, max_sentence_length]``.
        """
        num_sentences, max_spans_num = span_starts.size()
        device = span_starts.device

        sentence_indices = torch.arange(0, num_sentences, device=device).unsqueeze(1).expand(-1, max_spans_num)

        sparse_indices = torch.cat([sentence_indices.unsqueeze(2), span_starts.unsqueeze(2), span_ends.unsqueeze(2)],
                                   dim=2)
        rank = 3
        # ``torch.sparse_coo_tensor`` replaces the deprecated legacy constructor ``torch.sparse.LongTensor``
        # and works on whatever device the inputs live on.
        dense_labels = torch.sparse_coo_tensor(sparse_indices.view(num_sentences * max_spans_num, rank).t(),
                                               span_labels.view(-1),
                                               torch.Size([num_sentences] + [max_sentence_length] * (rank - 1)),
                                               dtype=torch.long,
                                               device=device) \
            .to_dense()
        return dense_labels

    def decode(self, contextualized_embeddings, gold_starts, gold_ends, gold_labels, masks, max_sent_length,
               num_sentences, sent_lengths):
        """Score every candidate span and, when gold spans are given, compute the loss.

        Args:
            contextualized_embeddings: Output of the context layer.
            gold_starts: Gold span starts or ``None``.
            gold_ends: Gold span ends or ``None``.
            gold_labels: Gold span labels or ``None``.
            masks: Boolean token mask of shape ``[num_sentences, max_sent_length]``.
            max_sent_length: Length of the longest sentence in the batch.
            num_sentences: Number of sentences in the batch.
            sent_lengths: Sentence lengths; only its device is used here.

        Returns:
            A dict with ``candidate_ner_scores`` (scores of the valid candidate spans) and, if gold spans
            are provided, ``loss``.
        """
        # Apply MLPs to starts and ends, [num_sentences, max_sentences_length,emb]
        candidate_starts_emb = self.start_mlp(contextualized_embeddings)
        candidate_ends_emb = self.end_mlp(contextualized_embeddings)
        candidate_ner_scores = self.biaffine(candidate_starts_emb, candidate_ends_emb).permute([0, 2, 3, 1])

        # generate candidate spans with argument pruning
        # Generate masks
        candidate_scores_mask = masks.unsqueeze(1) & masks.unsqueeze(2)
        device = sent_lengths.device
        # Keep only the positions (start, end) with end >= start.
        sentence_ends_leq_starts = (
            ~hanlp.utils.torch_util.lengths_to_mask(torch.arange(max_sent_length, device=device), max_sent_length)) \
            .unsqueeze_(0).expand(num_sentences, -1, -1)
        candidate_scores_mask &= sentence_ends_leq_starts
        candidate_ner_scores = candidate_ner_scores[candidate_scores_mask]
        predict_dict = {
            "candidate_ner_scores": candidate_ner_scores,

        }
        if gold_starts is not None:
            gold_ner_labels = self.get_dense_span_labels(gold_starts, gold_ends, gold_labels, max_sent_length)
            loss = torch.nn.functional.cross_entropy(candidate_ner_scores,
                                                     gold_ner_labels[candidate_scores_mask],
                                                     reduction=self.loss_reduction)
            predict_dict['loss'] = loss
        return predict_dict


================================================
FILE: hanlp/components/ner/ner_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-09-14 20:33
from abc import ABC
from typing import Union, Any, Tuple, Iterable

import tensorflow as tf
from hanlp.components.taggers.transformers.transformer_transform_tf import TransformerTransform

from hanlp.common.transform_tf import Transform

from hanlp.common.keras_component import KerasComponent
from hanlp.components.taggers.ngram_conv.ngram_conv_tagger import NgramConvTaggerTF
from hanlp.components.taggers.rnn_tagger_tf import RNNTaggerTF
from hanlp.components.taggers.transformers.transformer_tagger_tf import TransformerTaggerTF
from hanlp.metrics.chunking.sequence_labeling import iobes_to_span
from hanlp_common.util import merge_locals_kwargs


class IOBES_NamedEntityRecognizer(KerasComponent, ABC):
    """Mixin turning the tag sequences predicted by the parent component into entity spans."""

    def predict_batch(self, batch, inputs=None):
        """Yield the spans obtained by passing each sentence and its predicted tags to ``iobes_to_span``.

        Args:
            batch: The batch forwarded to the parent's ``predict_batch``.
            inputs: The sentences (word sequences) aligned with the predicted tag sequences.
        """
        tag_sequences = super().predict_batch(batch, inputs)
        for words, tags in zip(inputs, tag_sequences):
            yield from iobes_to_span(words, tags)


class IOBES_Transform(Transform):
    """Transform whose outputs are entity spans instead of raw tag sequences."""

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
                     batch=None) -> Iterable:
        """Yield the spans obtained by passing each sentence and its tags to ``iobes_to_span``.

        Args:
            Y: Model outputs forwarded to the parent's ``Y_to_outputs``.
            gold: Forwarded to the parent.
            inputs: The sentences (word sequences) aligned with the tag sequences.
            X: Forwarded to the parent.
            batch: Forwarded to the parent.
        """
        tag_sequences = super().Y_to_outputs(Y, gold, inputs=inputs, X=X, batch=batch)
        for words, tags in zip(inputs, tag_sequences):
            yield from iobes_to_span(words, tags)


class RNNNamedEntityRecognizerTF(RNNTaggerTF, IOBES_NamedEntityRecognizer):
    """RNN tagger (TensorFlow) whose predictions are converted to entity spans."""

    def fit(self, trn_data: str, dev_data: str = None, save_dir: str = None, embeddings=100, embedding_trainable=False,
            rnn_input_dropout=0.2, rnn_units=100, rnn_output_dropout=0.2, epochs=20, logger=None,
            loss: Union[tf.keras.losses.Loss, str] = None,
            optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='f1', batch_size=32,
            dev_batch_size=32, lr_decay_per_epoch=None,
            run_eagerly=False,
            verbose=True, **kwargs):
        """Collect every argument through ``locals()`` and forward them to the parent ``fit``."""
        # assert kwargs.get('run_eagerly', True), 'This component can only run eagerly'
        # kwargs['run_eagerly'] = True
        # ``locals()`` is captured below, so no extra local variable may be introduced in this method.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def build_loss(self, loss, **kwargs):
        """Build the loss, defaulting to sparse categorical cross-entropy on logits.

        Args:
            loss: A loss (or its name); a falsy value selects the default.
            **kwargs: Forwarded to the parent's ``build_loss``.

        Returns:
            Whatever the parent's ``build_loss`` returns.
        """
        if loss:
            return super().build_loss(loss, **kwargs)
        default_loss = tf.keras.losses.SparseCategoricalCrossentropy(
            reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
            from_logits=True)
        return super().build_loss(default_loss, **kwargs)


class NgramConvNamedEntityRecognizerTF(NgramConvTaggerTF, IOBES_NamedEntityRecognizer):
    """N-gram convolutional tagger (TensorFlow) whose predictions are converted to entity spans."""

    def fit(self, trn_data: Any, dev_data: Any, save_dir: str, word_embed: Union[str, int, dict] = 200,
            ngram_embed: Union[str, int, dict] = 50, embedding_trainable=True, window_size=4, kernel_size=3,
            filters=(200, 200, 200, 200, 200), dropout_embed=0.2, dropout_hidden=0.2, weight_norm=True,
            loss: Union[tf.keras.losses.Loss, str] = None,
            optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='f1', batch_size=100,
            epochs=100, logger=None, verbose=True, **kwargs):
        """Forward all arguments to the parent ``fit``.

        The arguments are passed positionally, so their order here has to match the parent's signature.
        """
        return super().fit(trn_data, dev_data, save_dir, word_embed, ngram_embed, embedding_trainable, window_size,
                           kernel_size, filters, dropout_embed, dropout_hidden, weight_norm, loss, optimizer, metrics,
                           batch_size, epochs, logger, verbose, **kwargs)


class IOBES_TransformerTransform(IOBES_Transform, TransformerTransform):
    """Combination of ``IOBES_Transform`` and ``TransformerTransform`` with no members of its own."""
    pass


class TransformerNamedEntityRecognizerTF(TransformerTaggerTF):
    """Transformer tagger (TensorFlow) using an ``IOBES_TransformerTransform`` by default."""

    def __init__(self, transform: TransformerTransform = None) -> None:
        """Create the recognizer.

        Args:
            transform: The transform to use; a falsy value selects a new ``IOBES_TransformerTransform``.
        """
        super().__init__(transform or IOBES_TransformerTransform())

    def fit(self, trn_data, dev_data, save_dir, transformer, optimizer='adamw', learning_rate=5e-5, weight_decay_rate=0,
            epsilon=1e-8, clipnorm=1.0, warmup_steps_ratio=0, use_amp=False, max_seq_length=128, batch_size=32,
            epochs=3, metrics='f1', run_eagerly=False, logger=None, verbose=True, **kwargs):
        """Collect every argument through ``locals()`` and forward them to the parent ``fit``."""
        # ``locals()`` is captured below, so no extra local variable may be introduced in this method.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))


================================================
FILE: hanlp/components/ner/rnn_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-12 18:00
from typing import Any

import torch
from hanlp_common.util import merge_locals_kwargs

import hanlp.utils.span_util
from hanlp.components.taggers.rnn_tagger import RNNTagger
from hanlp.metrics.chunking.conlleval import SpanF1


class RNNNamedEntityRecognizer(RNNTagger):

    def __init__(self, **kwargs) -> None:
        """An old-school RNN tagger using word2vec or fasttext embeddings.

        Args:
            **kwargs: Predefined config.
        """
        super().__init__(**kwargs)

    def build_metric(self, **kwargs):
        """Create a ``SpanF1`` metric for the tagging scheme of this component."""
        scheme = self.tagging_scheme
        return SpanF1(scheme)

    def evaluate_dataloader(self, data, criterion, logger=None, ratio_width=None, **kwargs):
        """Run the parent's evaluation and, if a logger is given, log the last element of the metric's result.

        Returns:
            The ``(loss, metric)`` pair obtained from the parent.
        """
        loss, metric = super().evaluate_dataloader(data, criterion, logger, ratio_width, **kwargs)
        if logger:
            report = metric.result(True, False)[-1]
            logger.info(report)
        return loss, metric

    def fit(self, trn_data, dev_data, save_dir, batch_size=50, epochs=100, embed=100, rnn_input=None, rnn_hidden=256,
            drop=0.5, lr=0.001, patience=10, crf=True, optimizer='adam', token_key='token', tagging_scheme=None,
            anneal_factor: float = 0.5, delimiter=None, anneal_patience=2, devices=None,
            token_delimiter=None,
            logger=None,
            verbose=True, **kwargs):
        """Collect every argument through ``locals()`` and forward them to the parent ``fit``."""
        # ``locals()`` is captured below, so no extra local variable may be introduced in this method.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def update_metrics(self, metric, logits, y, mask, batch, prediction):
        """Decode the logits into tags and compare them with the gold tags in ``batch['tag']``."""
        decoded = self.decode_output(logits, mask, batch)
        if isinstance(decoded, torch.Tensor):
            decoded = decoded.tolist()
        predicted_tags = self._id_to_tags(decoded)
        metric(predicted_tags, batch['tag'])

    def predict(self, tokens: Any, batch_size: int = None, **kwargs):
        """Delegate to the parent's ``predict``."""
        return super().predict(tokens, batch_size, **kwargs)

    def predict_data(self, data, batch_size, **kwargs):
        """Predict tags with the parent and convert them to ``(entity, type, begin, end)`` tuples.

        Args:
            data: Tokenized sentences.
            batch_size: Forwarded to the parent's ``predict_data``.
            **kwargs: Not used.

        Returns:
            One list of entities per sentence; ``entity`` is the token slice joined by
            ``config.token_delimiter`` and ``end`` is exclusive.

        Raises:
            ValueError: If the tagging scheme is none of ``IOBES``, ``BIO`` or ``BIOUL``.
        """
        outputs = super().predict_data(data, batch_size)
        scheme = self.tagging_scheme
        if scheme == 'IOBES':
            span_lists = [hanlp.utils.span_util.iobes_tags_to_spans(tags) for tags in outputs]
        elif scheme == 'BIO':
            span_lists = [hanlp.utils.span_util.bio_tags_to_spans(tags) for tags in outputs]
        elif scheme == 'BIOUL':
            span_lists = [hanlp.utils.span_util.bioul_tags_to_spans(tags) for tags in outputs]
        else:
            raise ValueError(f'Unrecognized tag scheme {scheme}')
        # The tag sequences in ``outputs`` are replaced in place by the entity tuples.
        for index, (tokens, spans) in enumerate(zip(data, span_lists)):
            outputs[index] = [
                (self.config.token_delimiter.join(tokens[begin:end + 1]), label, begin, end + 1)
                for label, (begin, end) in spans
            ]
        return outputs

    def save_config(self, save_dir, filename='config.json'):
        """Save the config, first guessing ``token_delimiter`` when it is unset.

        The guess is the empty string if each of the last 100 tokens of the token vocabulary has
        length 1, otherwise a single space.
        """
        if self.config.token_delimiter is None:
            tail_tokens = self.vocabs[self.config.token_key].idx_to_token[-100:]
            all_single_char = all([len(tok) == 1 for tok in tail_tokens])
            self.config.token_delimiter = '' if all_single_char else ' '
        super().save_config(save_dir, filename)


================================================
FILE: hanlp/components/ner/transformer_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-07 11:08
import functools
from typing import Union, List, Dict, Any, Set

from hanlp_trie import DictInterface, TrieDict

from hanlp.common.dataset import SamplerBuilder
from hanlp.components.taggers.transformers.transformer_tagger import TransformerTagger
from hanlp.metrics.chunking.sequence_labeling import get_entities
from hanlp.metrics.f1 import F1
from hanlp.datasets.ner.loaders.json_ner import prune_ner_tagset
from hanlp.utils.string_util import guess_delimiter
from hanlp_common.util import merge_locals_kwargs


class TransformerNamedEntityRecognizer(TransformerTagger):

    def __init__(self, **kwargs) -> None:
        r"""A simple tagger using transformers and a linear layer with an optional CRF
        (:cite:`lafferty2001conditional`) layer for the NER task.

        It can utilize a whitelist gazetteer, which is a dict mapping from entity name to entity type.
        During decoding, longest-prefix-matching of these entries is performed to override the prediction of
        the underlying statistical model. It also uses a blacklist to mask out mis-predicted entities.

        .. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what dictionary can
            do and what it can't do. The tutorial in `this book <http://nlp.hankcs.com/book.php>`_ can be very helpful.

        Args:
            **kwargs: Forwarded to the parent constructor.
        """
        super().__init__(**kwargs)

    def build_metric(self, **kwargs):
        """Create the ``F1`` metric used for evaluation."""
        return F1()

    # noinspection PyMethodOverriding
    def update_metrics(self, metric, logits, y, mask, batch, prediction):
        """Compare predicted spans with the spans derived from the gold tags in ``batch['tag']``."""
        gold_spans = self.tag_to_span(batch['tag'], batch)
        for predicted, gold in zip(prediction, gold_spans):
            metric(set(predicted), set(gold))

    # noinspection PyMethodOverriding
    def decode_output(self, logits, mask, batch, model=None):
        """Decode logits into tags with the parent, then convert the tags into spans."""
        tag_ids = super().decode_output(logits, mask, batch, model)
        tags = super().prediction_to_human(tag_ids, self.vocabs['tag'].idx_to_token, batch)
        return self.tag_to_span(tags, batch)

    def tag_to_span(self, batch_tags, batch):
        """Turn tag sequences into ``(label, start, end)`` spans, applying the dictionaries.

        For every sentence the steps are: extract entities from the tags; if the whitelist matches
        anything, rewrite the tags (in place) with the extracted entities and the whitelist matches and
        extract again; merge adjacent entities of the same type listed in ``merge_types``; finally drop
        entities whose surface form is in the blacklist.

        Args:
            batch_tags: Batch of tag lists. They are modified in place when the whitelist matches.
            batch: The batch providing the tokens under ``config.token_key``.

        Returns:
            One list of ``(label, start, end)`` per sentence.
        """

        def write_span(tags, start, end, label):
            # Overwrite tags[start:end] with the IOBES encoding of one entity.
            if end - start == 1:
                tags[start] = 'S-' + label
                return
            tags[start] = 'B-' + label
            for i in range(start + 1, end - 1):
                tags[i] = 'I-' + label
            tags[end - 1] = 'E-' + label

        sents = batch[self.config.token_key]
        whitelist = self.dict_whitelist
        blacklist = self.dict_blacklist
        merge_types = self.config.get('merge_types', None)
        spans = []
        for tags, tokens in zip(batch_tags, sents):
            entities = get_entities(tags)
            matches = whitelist.tokenize(tokens) if whitelist else None
            if matches:
                # Fix O E-LOC O like predictions
                entities = get_entities(tags)
                for label, start, end in entities:
                    write_span(tags, start, end, label)
                # NOTE(review): inside tags are written as 'I-' above while this guard tests for 'M';
                # confirm which tagging scheme the guard is meant to cover.
                for start, end, label in matches:
                    if tags[start][0] not in 'ME' and tags[end - 1][0] not in 'BM':
                        write_span(tags, start, end, label)
                entities = get_entities(tags)

            if merge_types and len(entities) > 1:
                merged = []
                begin = 0
                for i in range(1, len(entities)):
                    head, previous, current = entities[begin], entities[i - 1], entities[i]
                    # A run ends when the type changes, the entities are not adjacent or the type is
                    # not mergeable.
                    run_ends = head[0] != current[0] or previous[2] != current[1] \
                        or current[0] not in merge_types
                    if run_ends:
                        merged.append((head[0], head[1], previous[2]))
                        begin = i
                head = entities[begin]
                merged.append((head[0], head[1], entities[-1][2]))
                entities = merged

            if blacklist:
                delimiter_in_entity = self.config.get('delimiter_in_entity', ' ')
                entities = [(label, start, end) for label, start, end in entities
                            if delimiter_in_entity.join(tokens[start:end]) not in blacklist]
            spans.append(entities)
        return spans

    def decorate_spans(self, spans, batch):
        """Prepend the surface form to every span.

        Args:
            spans: Batch of ``(label, start, end)`` lists.
            batch: The batch; tokens are read from ``'<token_key>_'`` if present, else from ``token_key``.

        Returns:
            One list of ``(entity, label, start, end)`` per sentence.
        """
        delimiter_in_entity = self.config.get('delimiter_in_entity', ' ')
        token_key = self.config.token_key
        sents = batch.get(f'{token_key}_', batch[token_key])
        return [
            [(delimiter_in_entity.join(tokens[start:end]), label, start, end)
             for label, start, end in spans_per_sent]
            for spans_per_sent, tokens in zip(spans, sents)
        ]

    def generate_prediction_filename(self, tst_data, save_dir):
        """Delegate to the parent after replacing ``.tsv`` with ``.txt`` in the test set path."""
        txt_path = tst_data.replace('.tsv', '.txt')
        return super().generate_prediction_filename(txt_path, save_dir)

    def prediction_to_human(self, pred, vocab, batch):
        """Decorate the predicted spans with their surface forms; ``vocab`` is not used."""
        return self.decorate_spans(pred, batch)

    def input_is_flat(self, tokens):
        """Tell whether ``tokens`` is a non-empty list whose first element is a string.

        A falsy ``tokens`` is returned unchanged.
        """
        if not tokens:
            return tokens
        return isinstance(tokens, list) and isinstance(tokens[0], str)

    def fit(self, trn_data, dev_data, save_dir, transformer,
            delimiter_in_entity=None,
            merge_types: List[str] = None,
            average_subwords=False,
            word_dropout: float = 0.2,
            hidden_dropout=None,
            layer_dropout=0,
            scalar_mix=None,
            grad_norm=5.0,
            lr=5e-5,
            transformer_lr=None,
            adam_epsilon=1e-8,
            weight_decay=0,
            warmup_steps=0.1,
            crf=False,
            secondary_encoder=None,
            reduction='sum',
            batch_size=32,
            sampler_builder: SamplerBuilder = None,
            epochs=3,
            tagset=None,
            token_key='token',
            max_seq_len=None,
            sent_delimiter=None,
            char_level=False,
            hard_constraint=False,
            transform=None,
            logger=None,
            seed=None,
            devices: Union[float, int, List[int]] = None,
            **kwargs):
        """Fit component to training set.

        All arguments are collected through ``locals()`` and forwarded to the parent ``fit``.

        Args:
            trn_data: Training set.
            dev_data: Development set.
            save_dir: The directory to save trained component.
            transformer: An identifier of a pre-trained transformer.
            delimiter_in_entity: The delimiter between tokens in entity, which is used to rebuild entity by joining
                tokens during decoding.
            merge_types: The types of consecutive entities to be merged.
            average_subwords: ``True`` to average subword representations.
            word_dropout: Dropout rate to randomly replace a subword with MASK.
            hidden_dropout: Dropout rate applied to hidden states.
            layer_dropout: Randomly zero out hidden states of a transformer layer.
            scalar_mix: Layer attention.
            grad_norm: Gradient norm for clipping.
            lr: Learning rate for decoder.
            transformer_lr: Learning rate for encoder.
            adam_epsilon: The epsilon to use in Adam.
            weight_decay: The weight decay to use.
            warmup_steps: The number of warmup steps.
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
            secondary_encoder: An optional secondary encoder to provide enhanced representation by taking the hidden
                states from the main encoder as input.
            reduction: The loss reduction used in aggregating losses.
            batch_size: The number of samples in a batch.
            sampler_builder: The builder to build sampler, which will override batch_size.
            epochs: The number of epochs to train.
            tagset: Optional tagset to prune entities outside of this tagset from datasets.
            token_key: The key to tokens in dataset.
            max_seq_len: The maximum sequence length. Sequence longer than this will be handled by sliding
                window.
            sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
                be split here.
            char_level: Whether the sequence length is measured at char level.
            hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
                in a sentence, it will be split at a token anyway.
            transform: An optional transform to be applied to samples. Usually a character normalization transform is
                passed in.
            devices: Devices this component will live on.
            logger: Any :class:`logging.Logger` instance.
            seed: Random seed to reproduce this training.
            **kwargs: Merged with the named arguments and forwarded to the parent ``fit``.

        Returns:
            Whatever the parent ``fit`` returns (documented upstream as the best metrics).
        """
        # ``locals()`` is captured below, so no extra local variable may be introduced in this method.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def build_vocabs(self, trn, logger, **kwargs):
        """Build vocabularies with the parent, then guess ``delimiter_in_entity`` if it is not configured."""
        super().build_vocabs(trn, logger, **kwargs)
        if self.config.get('delimiter_in_entity', None) is not None:
            return
        # Check the first sample to guess the delimiter between tokens in a NE
        first_sample_tokens = trn[0][self.config.token_key]
        guessed = guess_delimiter(first_sample_tokens)
        logger.info(f'Guess the delimiter between tokens in named entity could be [blue]"{guessed}'
                    f'"[/blue]. If not, specify `delimiter_in_entity` in `fit()`')
        self.config.delimiter_in_entity = guessed

    def build_dataset(self, data, transform=None, **kwargs):
        """Build the dataset with the parent; for string ``data`` with a configured tagset, append tagset pruning."""
        dataset = super().build_dataset(data, transform, **kwargs)
        if not isinstance(data, str):
            return dataset
        tagset = self.config.get('tagset', None)
        if tagset:
            dataset.append_transform(functools.partial(prune_ner_tagset, tagset=tagset))
        return dataset

    @property
    def dict_whitelist(self) -> DictInterface:
        """The whitelist dictionary stored in the config, or ``None``."""
        return self.config.get('dict_whitelist', None)

    @dict_whitelist.setter
    def dict_whitelist(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
        # Anything that is not already a ``DictInterface`` is wrapped into a ``TrieDict``.
        needs_wrapping = dictionary is not None and not isinstance(dictionary, DictInterface)
        self.config.dict_whitelist = TrieDict(dictionary) if needs_wrapping else dictionary

    @property
    def dict_blacklist(self) -> DictInterface:
        """The blacklist dictionary stored in the config, or ``None``."""
        return self.config.get('dict_blacklist', None)

    @dict_blacklist.setter
    def dict_blacklist(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
        # Anything that is not already a ``DictInterface`` is wrapped into a ``TrieDict``.
        needs_wrapping = dictionary is not None and not isinstance(dictionary, DictInterface)
        self.config.dict_blacklist = TrieDict(dictionary) if needs_wrapping else dictionary


================================================
FILE: hanlp/components/parsers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-22 12:46

================================================
FILE: hanlp/components/parsers/alg.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import torch

from hanlp_common.conll import isprojective


def kmeans(x, k, max_it=32):
    r"""
    One-dimensional KMeans used to bucket sentences by their lengths.

    Args:
        x (list[int]):
            Sentence lengths, one per datapoint.
        k (int):
            Requested number of clusters. It is capped at ``len(x)`` and the number of clusters
            actually returned can be smaller, because clusters that end up empty are dropped.
        max_it (int):
            Upper bound on the number of refinement iterations. Default: 32.

    Returns:
        list[float], list[list[int]]:
            The centroid (mean length) of every non-empty cluster, and for each of those clusters
            the indices of the datapoints assigned to it.

    Examples:
        >>> x = torch.randint(10,20,(10,)).tolist()
        >>> x
        [15, 10, 17, 11, 18, 13, 17, 19, 18, 14]
        >>> centroids, clusters = kmeans(x, 3)
        >>> centroids
        [10.5, 14.0, 17.799999237060547]
        >>> clusters
        [[1, 3], [0, 5, 9], [2, 4, 6, 7, 8]]
    """

    # never ask for more clusters than there are datapoints
    k = min(len(x), k)
    points = torch.tensor(x, dtype=torch.float)
    # the initial centroids are drawn at random from the distinct values
    distinct = points.unique()
    centers = distinct[torch.randperm(len(distinct))[:k]]
    cluster_ids = torch.arange(k).unsqueeze(-1)

    def assign(current):
        # distance to, and index of, the closest centroid for every datapoint
        return (points.unsqueeze(-1) - current).abs().min(-1)

    def membership(labels):
        # [k, n] boolean matrix, row c marks the datapoints labelled c
        return cluster_ids.eq(labels)

    def vacant_clusters(member):
        return torch.where(~member.any(-1))[0].tolist()

    gaps, labels = assign(centers)

    for _ in range(max_it):
        member = membership(labels)
        vacant = vacant_clusters(member)
        # refill every empty cluster with the datapoint of the currently largest cluster
        # that lies farthest from its centroid
        while vacant:
            for cluster in vacant:
                largest = torch.where(member[member.sum(-1).argmax()])[0]
                outlier = largest[gaps[largest].argmax()]
                labels[outlier] = cluster
                member = membership(labels)
            vacant = vacant_clusters(member)
        # move each centroid to the mean of its members
        previous = centers
        centers = (points * member).sum(-1) / member.sum(-1)
        # and re-assign all datapoints
        gaps, labels = assign(centers)
        # converged: the centroids did not move
        if centers.equal(previous):
            break

    # only clusters that still own datapoints are reported
    kept = labels.unique().tolist()
    centroids = centers[kept].tolist()
    clusters = [torch.where(labels.eq(cluster))[0].tolist() for cluster in kept]

    return centroids, clusters


def eisner(scores, mask):
    r"""
    First-order Eisner algorithm for projective decoding.

    References:
        - Ryan McDonald, Koby Crammer and Fernando Pereira. 2005.
          `Online Large-Margin Training of Dependency Parsers`_.

    Args:
        scores (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
            Scores of all dependent-head pairs.
        mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
            The mask to avoid parsing over padding tokens.
            The first column serving as pseudo words for roots should be ``False``.

    Returns:
        ~torch.Tensor:
            A tensor with shape ``[batch_size, seq_len]`` for the resulting projective parse trees.

    Examples:
        >>> scores = torch.tensor([[[-13.5026, -18.3700, -13.0033, -16.6809],
                                    [-36.5235, -28.6344, -28.4696, -31.6750],
                                    [ -2.9084,  -7.4825,  -1.4861,  -6.8709],
                                    [-29.4880, -27.6905, -26.1498, -27.0233]]])
        >>> mask = torch.tensor([[False,  True,  True,  True]])
        >>> eisner(scores, mask)
        tensor([[0, 2, 0, 2]])

    .. _Online Large-Margin Training of Dependency Parsers:
        https://www.aclweb.org/anthology/P05-1012/
    """

    # number of real tokens per sentence, i.e. the end position of each sentence
    lens = mask.sum(1)
    batch_size, seq_len, _ = scores.shape
    # put the batch dimension last so that spans can be addressed through diagonals/stripes
    scores = scores.permute(2, 1, 0)
    # scores (s_*) and backpointers (p_*) of incomplete (i) and complete (c) spans
    s_i = torch.full_like(scores, float('-inf'))
    s_c = torch.full_like(scores, float('-inf'))
    p_i = scores.new_zeros(seq_len, seq_len, batch_size).long()
    p_c = scores.new_zeros(seq_len, seq_len, batch_size).long()
    # complete spans of width zero cost nothing
    s_c.diagonal().fill_(0)

    for w in range(1, seq_len):
        # n spans of width w: (0, w) ... (n - 1, n - 1 + w)
        n = seq_len - w
        starts = p_i.new_tensor(range(n)).unsqueeze(0)
        # ilr = C(i->r) + C(j->r+1)
        ilr = stripe(s_c, n, w) + stripe(s_c, n, w, (w, 1))
        # [batch_size, n, w] -> best split r for every span.
        # Both arc directions share the same split scores, so the maximum is taken only once.
        ilr_span, ilr_path = ilr.permute(2, 0, 1).max(-1)
        # I(j->i) = max(C(i->r) + C(j->r+1) + s(j->i)), i <= r < j
        s_i.diagonal(-w).copy_(ilr_span + scores.diagonal(-w))
        p_i.diagonal(-w).copy_(ilr_path + starts)
        # I(i->j) = max(C(i->r) + C(j->r+1) + s(i->j)), i <= r < j
        s_i.diagonal(w).copy_(ilr_span + scores.diagonal(w))
        p_i.diagonal(w).copy_(ilr_path + starts)

        # C(j->i) = max(C(r->i) + I(j->r)), i <= r < j
        cl = stripe(s_c, n, w, (0, 0), 0) + stripe(s_i, n, w, (w, 0))
        cl_span, cl_path = cl.permute(2, 0, 1).max(-1)
        s_c.diagonal(-w).copy_(cl_span)
        p_c.diagonal(-w).copy_(cl_path + starts)
        # C(i->j) = max(I(i->r) + C(r->j)), i < r <= j
        cr = stripe(s_i, n, w, (0, 1)) + stripe(s_c, n, w, (1, w), 0)
        cr_span, cr_path = cr.permute(2, 0, 1).max(-1)
        s_c.diagonal(w).copy_(cr_span)
        # the span (0, w) is only kept for sentences whose length is exactly w
        s_c[0, w][lens.ne(w)] = float('-inf')
        p_c.diagonal(w).copy_(cr_path + starts + 1)

    preds = []
    # batch-first backpointers on the CPU for the Python-level recursion
    p_c = p_c.permute(2, 0, 1).cpu()
    p_i = p_i.permute(2, 0, 1).cpu()
    for i, length in enumerate(lens.tolist()):
        heads = p_c.new_zeros(length + 1, dtype=torch.long)
        # the module-level ``backtrack`` helper fills ``heads`` in place
        backtrack(p_i[i], p_c[i], heads, 0, length, True)
        preds.append(heads.to(mask.device))

    return pad(preds, total_length=seq_len).to(mask.device)


def backtrack(p_i, p_c, heads, i, j, complete):
    r"""
    Recursively follows Eisner backpointers and records the head of every token in ``heads``.

    Args:
        p_i: 2-D table of split points of incomplete spans, indexed as ``p_i[i, j]``.
        p_c: 2-D table of split points of complete spans, indexed as ``p_c[i, j]``.
        heads: Indexable output buffer, ``heads[j] = i`` is written for every arc ``i -> j``.
        i: Head-side end of the current span.
        j: Other end of the current span.
        complete (bool): ``True`` if the span ``(i, j)`` is complete, ``False`` if it is incomplete.

    Returns:
        ``None``. ``heads`` is modified in place.
    """
    # an empty span carries no arcs
    if i == j:
        return
    if not complete:
        # an incomplete span is closed by the arc i -> j
        split = p_i[i, j]
        heads[j] = i
        # it is made of two complete spans facing each other, rooted at the two endpoints
        left, right = (j, i) if j < i else (i, j)
        backtrack(p_i, p_c, heads, left, split, True)
        backtrack(p_i, p_c, heads, right, split + 1, True)
        return
    # a complete span is an incomplete span followed by a smaller complete one
    split = p_c[i, j]
    backtrack(p_i, p_c, heads, i, split, False)
    backtrack(p_i, p_c, heads, split, j, True)


def stripe(x, n, w, offset=(0, 0), dim=1):
    r"""
    Returns a diagonal stripe of the tensor as a strided view.

    Args:
        x (~torch.Tensor): The input tensor. Its first two dimensions are treated as a square matrix
            whose side is ``x.size(1)``.
        n (int): Number of rows of the stripe; each successive row starts one step further down the diagonal.
        w (int): Number of elements taken in every row of the stripe.
        offset (tuple): ``(row, column)`` of the first element of the stripe. Default: ``(0, 0)``.
        dim (int): ``1`` to walk along the columns inside a stripe row, any other value to walk along the rows.
            Default: 1.

    Returns:
        ~torch.Tensor:
            A view of shape ``[n, w, *x.shape[2:]]`` on the contiguous version of ``x``.

    Examples:
        >>> x = torch.arange(25).view(5, 5)
        >>> x
        tensor([[ 0,  1,  2,  3,  4],
                [ 5,  6,  7,  8,  9],
                [10, 11, 12, 13, 14],
                [15, 16, 17, 18, 19],
                [20, 21, 22, 23, 24]])
        >>> stripe(x, 2, 3, (1, 1))
        tensor([[ 6,  7,  8],
                [12, 13, 14]])
        >>> stripe(x, 2, 3, dim=0)
        tensor([[ 0,  5, 10],
                [ 6, 11, 16]])
    """
    x = x.contiguous()
    seq_len = x.size(1)
    # number of scalars stored in one cell of the leading matrix
    numel = x[0, 0].numel()
    strides = list(x.stride())
    # moving to the next stripe row means one step down the main diagonal
    strides[0] = (seq_len + 1) * numel
    # moving inside a stripe row means one step right (dim == 1) or one step down (otherwise)
    strides[1] = numel if dim == 1 else seq_len * numel
    first = (offset[0] * seq_len + offset[1]) * numel
    shape = (n, w, *x.shape[2:])
    return x.as_strided(size=shape, stride=strides, storage_offset=first)


def cky(scores, mask):
    r"""
    `Cocke-Kasami-Younger`_ (CKY) decoding of the best binary constituency tree for every sentence.

    References:
        - Yu Zhang, Houquan Zhou and Zhenghua Li. 2020.
          `Fast and Accurate Neural CRF Constituency Parsing`_.

    Args:
        scores (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
            Scores of all candidate constituents.
        mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
            The mask to avoid parsing over padding tokens.
            For each square matrix in a batch, the positions except upper triangular part should be masked out.
            The sentence length is read from the first row of each matrix.

    Returns:
        For every sentence, the list of predicted spans ``(i, j)`` in pre-order.
        Sentences of length zero yield an empty list.

    Examples:
        >>> scores = torch.tensor([[[ 2.5659,  1.4253, -2.5272,  3.3011],
                                    [ 1.3687, -0.5869,  1.0011,  3.3020],
                                    [ 1.2297,  0.4862,  1.1975,  2.5387],
                                    [-0.0511, -1.2541, -0.7577,  0.2659]]])
        >>> mask = torch.tensor([[[False,  True,  True,  True],
                                  [False, False,  True,  True],
                                  [False, False, False,  True],
                                  [False, False, False, False]]])
        >>> cky(scores, mask)
        [[(0, 3), (0, 1), (1, 3), (1, 2), (2, 3)]]

    .. _Cocke-Kasami-Younger:
        https://en.wikipedia.org/wiki/CYK_algorithm
    .. _Fast and Accurate Neural CRF Constituency Parsing:
        https://www.ijcai.org/Proceedings/2020/560/
    """

    lens = mask[:, 0].sum(-1)
    # batch dimension last, so that spans of equal width sit on one diagonal
    scores = scores.permute(1, 2, 0)
    seq_len, seq_len, batch_size = scores.shape
    # best score and best split point of every span
    chart = scores.new_zeros(seq_len, seq_len, batch_size)
    splits = scores.new_zeros(seq_len, seq_len, batch_size).long()

    for width in range(1, seq_len):
        if width == 1:
            # spans covering a single token have no split point
            chart.diagonal(width).copy_(scores.diagonal(width))
        else:
            n_spans = seq_len - width
            offsets = splits.new_tensor(range(n_spans)).unsqueeze(0)
            # [n_spans, width - 1, batch_size]: left and right children for every split point
            left = stripe(chart, n_spans, width - 1, (0, 1))
            right = stripe(chart, n_spans, width - 1, (1, width), 0)
            # [batch_size, n_spans]: best split of every span
            best, argbest = (left + right).permute(2, 0, 1).max(-1)
            chart.diagonal(width).copy_(best + scores.diagonal(width))
            splits.diagonal(width).copy_(argbest + offsets + 1)

    def collect(table, i, j, spans):
        # pre-order traversal: the span itself, then its left child, then its right child
        spans.append((i, j))
        if j != i + 1:
            k = table[i][j]
            collect(table, i, k, spans)
            collect(table, k, j, spans)
        return spans

    tables = splits.permute(2, 0, 1).tolist()
    trees = [collect(tables[b], 0, length, []) if length else [] for b, length in enumerate(lens.tolist())]

    return trees


def istree(sequence, proj=False, multiroot=False):
    r"""
    Checks if the arcs form a valid dependency tree.

    Args:
        sequence (list[int]):
            A list of head indices. Token positions are 1-based, head ``0`` denotes the root.
        proj (bool):
            If ``True``, requires the tree to be projective. Default: ``False``.
        multiroot (bool):
            If ``False``, requires the tree to contain only a single root. Default: ``False``.

    Returns:
        ``True`` if the arcs form a valid tree, ``False`` otherwise.

    Examples:
        >>> istree([3, 0, 0, 3], multiroot=True)
        True
        >>> istree([3, 0, 0, 3], proj=True)
        False
    """

    if proj and not isprojective(sequence):
        return False
    # a tree needs at least one root, and exactly one unless several are allowed
    root_count = sum(1 for head in sequence if head == 0)
    if root_count == 0 or (root_count > 1 and not multiroot):
        return False
    # no token may be its own head
    for position, head in enumerate(sequence, 1):
        if position == head:
            return False
    # finally, the arcs must not contain any cycle
    has_cycle = next(tarjan(sequence), None) is not None
    return not has_cycle


def tarjan(sequence):
    r"""
    Tarjan algorithm for finding Strongly Connected Components (SCCs) of a graph.

    The graph has one node per token (1-based) plus node ``0`` for the root, and an edge from
    every head to each of its dependents.

    Args:
        sequence (list[int]):
            List of head indices, ``sequence[k]`` being the head of token ``k + 1``.

    Yields:
        A list of indices that make up a SCC. All self-loops are ignored.

    Examples:
        >>> next(tarjan([2, 5, 0, 3, 1]))  # (1 -> 5 -> 2 -> 1) is a cycle
        [2, 5, 1]
    """

    # position 0 is the pseudo root, which has no head
    sequence = [-1] + sequence
    size = len(sequence)
    # dependents of every head, collected once in increasing order, so that visiting a node
    # does not require a scan over the whole sequence
    children = {}
    for j, head in enumerate(sequence):
        children.setdefault(head, []).append(j)
    # record the search order, i.e., the timestep
    dfn = [-1] * size
    # record the smallest timestep in a SCC
    low = [-1] * size
    # push the visited into the stack
    stack, onstack = [], [False] * size

    def connect(i, timestep):
        dfn[i] = low[i] = timestep[0]
        # timestep is a one-element list so that the counter is shared by all recursive calls
        timestep[0] += 1
        stack.append(i)
        onstack[i] = True

        for j in children.get(i, ()):
            if dfn[j] == -1:
                yield from connect(j, timestep)
                low[i] = min(low[i], low[j])
            elif onstack[j]:
                low[i] = min(low[i], dfn[j])

        # a SCC is completed
        if low[i] == dfn[i]:
            cycle = [stack.pop()]
            while cycle[-1] != i:
                onstack[cycle[-1]] = False
                cycle.append(stack.pop())
            onstack[i] = False
            # ignore the self-loop
            if len(cycle) > 1:
                yield cycle

    timestep = [0]
    for i in range(size):
        if dfn[i] == -1:
            yield from connect(i, timestep)


def chuliu_edmonds(s):
    r"""
    ChuLiu/Edmonds algorithm for non-projective decoding.

    Some code is borrowed from `tdozat's implementation`_.
    Descriptions of notations and formulas can be found in
    `Non-projective Dependency Parsing using Spanning Tree Algorithms`_.

    Notes:
        The algorithm does not guarantee to parse a single-root tree.

        ``s`` is modified in place: the scores ``s[0, 1:]`` and the diagonal entries from index 1 on
        are overwritten with ``-inf``.

    References:
        - Ryan McDonald, Fernando Pereira, Kiril Ribarov and Jan Hajic. 2005.
          `Non-projective Dependency Parsing using Spanning Tree Algorithms`_.

    Args:
        s (~torch.Tensor): ``[seq_len, seq_len]``.
            Scores of all dependent-head pairs.

    Returns:
        ~torch.Tensor:
            A tensor with shape ``[seq_len]`` for the resulting non-projective parse tree.

    .. _tdozat's implementation:
        https://github.com/tdozat/Parser-v3
    .. _Non-projective Dependency Parsing using Spanning Tree Algorithms:
        https://www.aclweb.org/anthology/H05-1066/
    """

    # the pseudo root at position 0 must not take any of the real tokens as its head
    s[0, 1:] = float('-inf')
    # prevent self-loops
    s.diagonal()[1:].fill_(float('-inf'))
    # select heads with highest scores
    tree = s.argmax(-1)
    # lazily fetch the first cycle found by the tarjan algorithm, if there is any
    cycle = next(tarjan(tree.tolist()[1:]), None)
    # if the tree has no cycles, then it is a MST
    if not cycle:
        return tree
    # indices of cycle in the original tree
    cycle = torch.tensor(cycle)
    # indices of noncycle in the original tree
    noncycle = torch.ones(len(s)).index_fill_(0, cycle, 0)
    noncycle = torch.where(noncycle.gt(0))[0]

    def contract(s):
        # heads of cycle in original tree
        cycle_heads = tree[cycle]
        # scores of cycle in original tree
        s_cycle = s[cycle, cycle_heads]

        # calculate the scores of cycle's potential dependents
        # s(c->x) = max(s(x'->x)), x in noncycle and x' in cycle
        s_dep = s[noncycle][:, cycle]
        # find the best cycle head for each noncycle dependent
        deps = s_dep.argmax(1)
        # calculate the scores of cycle's potential heads
        # s(x->c) = max(s(x'->x) - s(a(x')->x') + s(cycle)), x in noncycle and x' in cycle
        #                                                    a(v) is the predecessor of v in cycle
        #                                                    s(cycle) = sum(s(a(v)->v))
        s_head = s[cycle][:, noncycle] - s_cycle.view(-1, 1) + s_cycle.sum()
        # find the best noncycle head for each cycle dependent
        heads = s_head.argmax(0)

        # the contracted graph keeps all noncycle nodes and gets one extra, last node standing for the cycle;
        # the index -1 only reserves that row/column, its content is overwritten right below
        contracted = torch.cat((noncycle, torch.tensor([-1])))
        # calculate the scores of contracted graph
        s = s[contracted][:, contracted]
        # set the contracted graph scores of cycle's potential dependents
        s[:-1, -1] = s_dep[range(len(deps)), deps]
        # set the contracted graph scores of cycle's potential heads
        s[-1, :-1] = s_head[heads, range(len(heads))]

        return s, heads, deps

    # keep track of the endpoints of the edges into and out of cycle for reconstruction later
    s, heads, deps = contract(s)

    # y is the contracted tree, obtained by solving the smaller problem recursively
    y = chuliu_edmonds(s)
    # exclude head of cycle from y
    y, cycle_head = y[:-1], y[-1]

    # fix the subtree with no heads coming from the cycle
    # len(y) denotes heads coming from the cycle
    subtree = y < len(y)
    # add the nodes to the new tree
    tree[noncycle[subtree]] = noncycle[y[subtree]]
    # fix the subtree with heads coming from the cycle
    subtree = ~subtree
    # add the nodes to the tree
    tree[noncycle[subtree]] = cycle[deps[subtree]]
    # fix the root of the cycle
    cycle_root = heads[cycle_head]
    # break the cycle and add the root of the cycle to the tree
    tree[cycle[cycle_root]] = noncycle[cycle_head]

    return tree


def mst(scores, mask, multiroot=False):
    r"""
    MST algorithm for decoding non-projective trees.
    This is a wrapper for ChuLiu/Edmonds algorithm.

    Every sentence is first parsed with ChuLiu/Edmonds. If ``multiroot=False`` and the resulting tree
    has more than one root, each of these roots is tried in turn as the only allowed root, and the
    highest-scoring of the re-parsed single-root trees is kept.
    Otherwise the tree found by the first pass is taken as the final output.

    Args:
        scores (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
            Scores of all dependent-head pairs.
        mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
            The mask to avoid parsing over padding tokens.
            The first column serving as pseudo words for roots should be ``False``.
        multiroot (bool):
            If ``False``, ensures to parse a single-root tree. Default: ``False``.

    Returns:
        ~torch.Tensor:
            A tensor with shape ``[batch_size, seq_len]`` for the resulting non-projective parse trees.

    Examples:
        >>> scores = torch.tensor([[[-11.9436, -13.1464,  -6.4789, -13.8917],
                                    [-60.6957, -60.2866, -48.6457, -63.8125],
                                    [-38.1747, -49.9296, -45.2733, -49.5571],
                                    [-19.7504, -23.9066,  -9.9139, -16.2088]]])
        >>> scores[:, 0, 1:] = float('-inf')
        >>> scores.diagonal(0, 1, 2)[1:].fill_(float('-inf'))
        >>> mask = torch.tensor([[False,  True,  True,  True]])
        >>> mst(scores, mask)
        tensor([[0, 2, 0, 2]])
    """

    _, seq_len, _ = scores.shape
    # decoding runs sentence by sentence on the CPU
    per_sentence = scores.detach().cpu().unbind()
    lengths = mask.sum(1).tolist()

    preds = []
    for i, length in enumerate(lengths):
        # drop the padding, keep the pseudo root and the real tokens
        s = per_sentence[i][:length + 1, :length + 1]
        tree = chuliu_edmonds(s)
        roots = torch.where(tree[1:].eq(0))[0] + 1
        if multiroot or len(roots) <= 1:
            preds.append(tree)
            continue
        # scores of attaching each token to the root, remembered before they are masked out
        root_scores = s[:, 0]
        best_score = float('-inf')
        s = s.index_fill(1, torch.tensor(0), float('-inf'))
        for root in roots:
            # allow exactly one token to attach to the root and parse again
            s[:, 0] = float('-inf')
            s[root, 0] = root_scores[root]
            candidate = chuliu_edmonds(s)
            candidate_score = s[1:].gather(1, candidate[1:].unsqueeze(-1)).sum()
            if candidate_score > best_score:
                best_score, tree = candidate_score, candidate
        preds.append(tree)

    return pad(preds, total_length=seq_len).to(mask.device)


def eisner2o(scores, mask):
    r"""
    Second-order Eisner algorithm for projective decoding.
    This is an extension of the first-order one that further incorporates sibling scores into tree scoring.

    References:
        - Ryan McDonald and Fernando Pereira. 2006.
          `Online Learning of Approximate Dependency Parsing Algorithms`_.

    Args:
        scores (~torch.Tensor, ~torch.Tensor):
            A tuple of two tensors representing the first-order and second-order scores respectively.
            The first (``[batch_size, seq_len, seq_len]``) holds scores of all dependent-head pairs.
            The second (``[batch_size, seq_len, seq_len, seq_len]``) holds scores of all dependent-head-sibling triples.
        mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
            The mask to avoid parsing over padding tokens.
            The first column serving as pseudo words for roots should be ``False``.

    Returns:
        ~torch.Tensor:
            A tensor with shape ``[batch_size, seq_len]`` for the resulting projective parse trees.

    Examples:
        >>> s_arc = torch.tensor([[[ -2.8092,  -7.9104,  -0.9414,  -5.4360],
                                   [-10.3494,  -7.9298,  -3.6929,  -7.3985],
                                   [  1.1815,  -3.8291,   2.3166,  -2.7183],
                                   [ -3.9776,  -3.9063,  -1.6762,  -3.1861]]])
        >>> s_sib = torch.tensor([[[[ 0.4719,  0.4154,  1.1333,  0.6946],
                                    [ 1.1252,  1.3043,  2.1128,  1.4621],
                                    [ 0.5974,  0.5635,  1.0115,  0.7550],
                                    [ 1.1174,  1.3794,  2.2567,  1.4043]],
                                   [[-2.1480, -4.1830, -2.5519, -1.8020],
                                    [-1.2496, -1.7859, -0.0665, -0.4938],
                                    [-2.6171, -4.0142, -2.9428, -2.2121],
                                    [-0.5166, -1.0925,  0.5190,  0.1371]],
                                   [[ 0.5827, -1.2499, -0.0648, -0.0497],
                                    [ 1.4695,  0.3522,  1.5614,  1.0236],
                                    [ 0.4647, -0.7996, -0.3801,  0.0046],
                                    [ 1.5611,  0.3875,  1.8285,  1.0766]],
                                   [[-1.3053, -2.9423, -1.5779, -1.2142],
                                    [-0.1908, -0.9699,  0.3085,  0.1061],
                                    [-1.6783, -2.8199, -1.8853, -1.5653],
                                    [ 0.3629, -0.3488,  0.9011,  0.5674]]]])
        >>> mask = torch.tensor([[False,  True,  True,  True]])
        >>> eisner2o((s_arc, s_sib), mask)
        tensor([[0, 2, 0, 2]])

    .. _Online Learning of Approximate Dependency Parsing Algorithms:
        https://www.aclweb.org/anthology/E06-1011/
    """

    # the end position of each sentence in a batch
    lens = mask.sum(1)
    s_arc, s_sib = scores
    batch_size, seq_len, _ = s_arc.shape
    # [seq_len, seq_len, batch_size]
    s_arc = s_arc.permute(2, 1, 0)
    # [seq_len, seq_len, seq_len, batch_size]
    s_sib = s_sib.permute(2, 1, 3, 0)
    # scores of incomplete (i), sibling (s) and complete (c) spans, all starting at -inf
    s_i = torch.full_like(s_arc, float('-inf'))
    s_s = torch.full_like(s_arc, float('-inf'))
    s_c = torch.full_like(s_arc, float('-inf'))
    # backpointers (best split points) of the three span types
    p_i = s_arc.new_zeros(seq_len, seq_len, batch_size).long()
    p_s = s_arc.new_zeros(seq_len, seq_len, batch_size).long()
    p_c = s_arc.new_zeros(seq_len, seq_len, batch_size).long()
    # complete spans of width zero get score 0
    s_c.diagonal().fill_(0)

    for w in range(1, seq_len):
        # n denotes the number of spans to iterate,
        # from span (0, w) to span (n, n+w) given width w
        n = seq_len - w
        starts = p_i.new_tensor(range(n)).unsqueeze(0)
        # I(j->i) = max(I(j->r) + S(j->r, i)), i < r < j |
        #               C(j->j) + C(i->j-1))
        #           + s(j->i)
        # [n, w, batch_size]
        il = stripe(s_i, n, w, (w, 1)) + stripe(s_s, n, w, (1, 0), 0)
        il += stripe(s_sib[range(w, n + w), range(n)], n, w, (0, 1))
        # [n, 1, batch_size]
        il0 = stripe(s_c, n, 1, (w, w)) + stripe(s_c, n, 1, (0, w - 1))
        # il0[0] are set to zeros since the scores of the complete spans starting from 0 are always -inf
        il[:, -1] = il0.index_fill_(0, lens.new_tensor(0), 0).squeeze(1)
        il_span, il_path = il.permute(2, 0, 1).max(-1)
        s_i.diagonal(-w).copy_(il_span + s_arc.diagonal(-w))
        p_i.diagonal(-w).copy_(il_path + starts + 1)
        # I(i->j) = max(I(i->r) + S(i->r, j), i < r < j |
        #               C(i->i) + C(j->i+1))
        #           + s(i->j)
        # [n, w, batch_size]
        ir = stripe(s_i, n, w) + stripe(s_s, n, w, (0, w), 0)
        ir += stripe(s_sib[range(n), range(w, n + w)], n, w)
        # rule out the sibling-based candidates for the spans starting at position 0
        ir[0] = float('-inf')
        # [n, 1, batch_size]
        ir0 = stripe(s_c, n, 1) + stripe(s_c, n, 1, (w, 1))
        # the first candidate of every span is the one without any sibling
        ir[:, 0] = ir0.squeeze(1)
        ir_span, ir_path = ir.permute(2, 0, 1).max(-1)
        s_i.diagonal(w).copy_(ir_span + s_arc.diagonal(w))
        p_i.diagonal(w).copy_(ir_path + starts)

        # [n, w, batch_size]
        slr = stripe(s_c, n, w) + stripe(s_c, n, w, (w, 1))
        slr_span, slr_path = slr.permute(2, 0, 1).max(-1)
        # S(j, i) = max(C(i->r) + C(j->r+1)), i <= r < j
        s_s.diagonal(-w).copy_(slr_span)
        p_s.diagonal(-w).copy_(slr_path + starts)
        # S(i, j) = max(C(i->r) + C(j->r+1)), i <= r < j
        s_s.diagonal(w).copy_(slr_span)
        p_s.diagonal(w).copy_(slr_path + starts)

        # C(j->i) = max(C(r->i) + I(j->r)), i <= r < j
        cl = stripe(s_c, n, w, (0, 0), 0) + stripe(s_i, n, w, (w, 0))
        cl_span, cl_path = cl.permute(2, 0, 1).max(-1)
        s_c.diagonal(-w).copy_(cl_span)
        p_c.diagonal(-w).copy_(cl_path + starts)
        # C(i->j) = max(I(i->r) + C(r->j)), i < r <= j
        cr = stripe(s_i, n, w, (0, 1)) + stripe(s_c, n, w, (1, w), 0)
        cr_span, cr_path = cr.permute(2, 0, 1).max(-1)
        s_c.diagonal(w).copy_(cr_span)
        # disable multi words to modify the root
        s_c[0, w][lens.ne(w)] = float('-inf')
        p_c.diagonal(w).copy_(cr_path + starts + 1)

    def backtrack(p_i, p_s, p_c, heads, i, j, flag):
        # Follows the backpointers of span (i, j) and writes the head of every token into ``heads``.
        # ``flag`` names the span type: 'c' complete, 's' sibling, 'i' incomplete.
        if i == j:
            return
        if flag == 'c':
            r = p_c[i, j]
            backtrack(p_i, p_s, p_c, heads, i, r, 'i')
            backtrack(p_i, p_s, p_c, heads, r, j, 'c')
        elif flag == 's':
            r = p_s[i, j]
            i, j = sorted((i, j))
            backtrack(p_i, p_s, p_c, heads, i, r, 'c')
            backtrack(p_i, p_s, p_c, heads, j, r + 1, 'c')
        elif flag == 'i':
            # an incomplete span always contributes the arc i -> j
            r, heads[j] = p_i[i, j], i
            if r == i:
                # no sibling was chosen: only the complete span rooted at j remains
                r = i + 1 if i < j else i - 1
                backtrack(p_i, p_s, p_c, heads, j, r, 'c')
            else:
                backtrack(p_i, p_s, p_c, heads, i, r, 'i')
                backtrack(p_i, p_s, p_c, heads, r, j, 's')

    preds = []
    # batch-first backpointers on the CPU for the Python-level recursion
    p_i = p_i.permute(2, 0, 1).cpu()
    p_s = p_s.permute(2, 0, 1).cpu()
    p_c = p_c.permute(2, 0, 1).cpu()
    for i, length in enumerate(lens.tolist()):
        heads = p_c.new_zeros(length + 1, dtype=torch.long)
        backtrack(p_i[i], p_s[i], p_c[i], heads, 0, length, 'c')
        preds.append(heads.to(mask.device))

    return pad(preds, total_length=seq_len).to(mask.device)


def pad(tensors, padding_value=0, total_length=None):
    r"""
    Stacks tensors of different sizes into one batch tensor, filling the gaps with ``padding_value``.

    Args:
        tensors (list[~torch.Tensor]):
            A non-empty list of tensors that all have the same number of dimensions.
        padding_value:
            The value written to all positions not covered by an input tensor. Default: 0.
        total_length (int):
            If given, the size of the first tensor dimension (dimension 1 of the output) is set to this value.
            It must not be smaller than the longest input. Default: ``None``.

    Returns:
        ~torch.Tensor:
            A tensor of shape ``[len(tensors), *max_sizes]`` with the dtype and device of ``tensors[0]``,
            where ``tensors[k]`` occupies the leading corner of ``out[k]``.
    """
    n_dims = tensors[0].dim()
    # every output dimension must be able to hold the largest input along that dimension
    size = [len(tensors)] + [max(tensor.size(dim) for tensor in tensors) for dim in range(n_dims)]
    if total_length is not None:
        assert total_length >= size[1]
        size[1] = total_length
    out_tensor = tensors[0].new_empty(size).fill_(padding_value)
    for row, tensor in enumerate(tensors):
        # the index has to be a tuple of slices: indexing a tensor with a list of slices
        # is deprecated non-tuple sequence indexing
        corner = tuple(slice(0, extent) for extent in tensor.size())
        out_tensor[row][corner] = tensor
    return out_tensor


def decode_dep(s_arc, mask, tree=False, proj=False):
    r"""
    Decodes the head of every token from the arc scores.

    The heads are first predicted greedily. If ``tree=True``, the sentences whose greedy prediction is
    not a well-formed tree are decoded again, with Eisner if ``proj=True`` and with MST otherwise.

    Notes:
        When the MST branch is taken, the self-loop scores of ``s_arc`` are set to ``-inf`` in place.

    Args:
        s_arc (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
            Scores of all possible arcs.
        mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
            The mask for covering the unpadded tokens.
        tree (bool):
            If ``True``, ensures to output well-formed trees. Default: ``False``.
        proj (bool):
            If ``True``, ensures to output projective trees. Default: ``False``.

    Returns:
        ~torch.Tensor:
            Predicted head indices of shape ``[batch_size, seq_len]``.
    """

    arc_preds = s_arc.argmax(-1)
    # the well-formedness check walks over every sentence in Python, skip it unless it is needed
    if not tree:
        return arc_preds

    lens = mask.sum(1)
    bad = [not istree(seq[1:i + 1], proj) for i, seq in zip(lens.tolist(), arc_preds.tolist())]
    if any(bad):
        if proj:
            alg = eisner
        else:
            alg = mst
            # diagonal(0, 1, 2) has shape [batch_size, seq_len]: forbid self-loops for every token
            # but the pseudo root of each sentence, hence the slice on the last dimension
            s_arc.diagonal(0, 1, 2)[:, 1:].fill_(float('-inf'))
        arc_preds[bad] = alg(s_arc[bad], mask[bad])

    return arc_preds


================================================
FILE: hanlp/components/parsers/alg_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-26 19:49
# Ported from the PyTorch implementation https://github.com/zysite/biaffine-parser
from typing import List
import numpy as np
import tensorflow as tf
from collections import defaultdict


def nonzero(t: tf.Tensor) -> tf.Tensor:
    """Return the coordinates of all entries of ``t`` that are strictly greater than zero.

    Args:
      t: The tensor to inspect.

    Returns:
      The result of ``tf.where`` applied to the mask ``t > 0``.
    """
    positive = t > 0
    return tf.where(positive)


def view(t: tf.Tensor, *dims) -> tf.Tensor:
    """Reshape ``t``, taking the target shape as separate positional arguments.

    Args:
      t: The tensor to reshape.
      *dims: Sizes of the target dimensions.

    Returns:
      ``t`` reshaped to ``dims``.
    """
    shape = dims
    return tf.reshape(t, shape)


def arange(n: int) -> tf.Tensor:
    """Return ``tf.range(n)``, i.e. the integers from ``0`` up to but excluding ``n``.

    Args:
      n: The exclusive upper bound.

    Returns:
      A 1-D tensor holding the range.
    """
    indices = tf.range(n)
    return indices


def randperm(n: int) -> tf.Tensor:
    """Return a random permutation of the integers produced by ``arange(n)``.

    Args:
      n: The number of elements to permute.

    Returns:
      A 1-D tensor with the shuffled range.
    """
    identity = arange(n)
    return tf.random.shuffle(identity)


def tolist(t: tf.Tensor) -> List:
    """Convert a tensor, or any object offering ``tolist()``, to a (nested) Python list.

    Args:
      t: A ``tf.Tensor``, which is converted through ``numpy()`` first, or any other object
        exposing a ``tolist()`` method.

    Returns:
      The content of ``t`` as returned by ``tolist()``.
    """
    array = t.numpy() if isinstance(t, tf.Tensor) else t
    return array.tolist()


def kmeans(x, k, seed=None):
    """KMeans clustering of sentences by length.

    See https://github.com/zysite/biaffine-parser/blob/master/parser/utils/alg.py#L7

    The clustering runs on the distinct lengths weighted by their frequencies, and the result is
    mapped back to the original datapoints at the end.

    Args:
      x(list): Lengths of sentences
      k(int): Number of clusters. It must not exceed the number of distinct lengths.
      seed: Seed passed to ``tf.random.shuffle`` when the initial centroids are drawn. (Default value = None)

    Returns:
      A tuple ``(centroids, clusters)``: the centroids of the clusters that own at least one distinct
      length, and for each of these clusters the list of indices into ``x`` assigned to it.

    Raises:
      AssertionError: If there are fewer distinct lengths than ``k``.

    """
    x = tf.constant(x, dtype=tf.float32)
    # count the frequency of each datapoint
    d, indices, f = tf.unique_with_counts(x, tf.int32)
    f = tf.cast(f, tf.float32)
    # calculate the sum of the values of the same datapoints
    total = d * f
    # initialize k centroids randomly
    c, old = tf.random.shuffle(d, seed)[:k], None
    # assign labels to each datapoint based on centroids
    dists = tf.abs(tf.expand_dims(d, -1) - c)
    y = tf.argmin(dists, axis=-1, output_type=tf.int32)
    # keep only the distance of each datapoint to its own centroid
    dists = tf.gather_nd(dists, tf.transpose(tf.stack([tf.range(tf.shape(dists)[0], dtype=tf.int32), y])))
    # make sure number of datapoints is greater than that of clusters
    assert len(d) >= k, f"unable to assign {len(d)} datapoints to {k} clusters"

    while old is None or not tf.reduce_all(c == old):
        # if an empty cluster is encountered,
        # choose the farthest datapoint from the biggest cluster
        # and move that the empty one
        for i in range(k):
            if not tf.reduce_any(y == i):
                mask = tf.cast(y == tf.expand_dims(tf.range(k, dtype=tf.int32), -1), tf.float32)
                lens = tf.reduce_sum(mask, axis=-1)
                biggest = view(nonzero(mask[tf.argmax(lens)]), -1)
                farthest = tf.argmax(tf.gather(dists, biggest))
                # tensors are immutable: the scatter op returns the updated labels, which must be
                # kept, otherwise the empty cluster is never refilled
                y = tf.tensor_scatter_nd_update(y, tf.expand_dims(tf.expand_dims(biggest[farthest], -1), -1), [i])
        mask = tf.cast(y == tf.expand_dims(tf.range(k, dtype=tf.int32), -1), tf.float32)
        # update the centroids
        c, old = tf.cast(tf.reduce_sum(total * mask, axis=-1), tf.float32) / tf.cast(tf.reduce_sum(f * mask, axis=-1),
                                                                                     tf.float32), c
        # re-assign all datapoints to clusters
        dists = tf.abs(tf.expand_dims(d, -1) - c)
        y = tf.argmin(dists, axis=-1, output_type=tf.int32)
        dists = tf.gather_nd(dists, tf.transpose(tf.stack([tf.range(tf.shape(dists)[0], dtype=tf.int32), y])))
    # assign all datapoints to the new-generated clusters
    # without considering the empty ones
    y, (assigned, _) = tf.gather(y, indices), tf.unique(y)
    # get the centroids of the assigned clusters
    centroids = tf.gather(c, assigned).numpy().tolist()
    # map all values of datapoints to buckets
    clusters = [tf.squeeze(tf.where(y == i), axis=-1).numpy().tolist() for i in assigned]

    return centroids, clusters


# ***************************************************************
class Tarjan:
    """Computes Tarjan's algorithm for finding strongly connected components (cycles) of a graph.

    The graph is built from a head prediction: vertex ``0`` plus one vertex per entry of
    ``prediction[tokens]``, with an edge ``head -> dependent`` for each of those entries.
    All components are computed inside the constructor and are exposed through :attr:`SCCs`.
    """

    def __init__(self, prediction, tokens):
        """

        Parameters
        ----------
        prediction : numpy.ndarray
            a predicted dependency tree where prediction[dep_idx] = head_idx
        tokens : numpy.ndarray
            the tokens we care about (i.e. exclude _GO, _EOS, and _PAD)
        """
        self._edges = defaultdict(set)
        self._vertices = set((0,))
        # NOTE: the i-th selected token becomes vertex ``i + 1``; this equals its real position only
        # when ``tokens`` enumerates 1..n in order (e.g. ``np.arange(1, length)``).
        for dep, head in enumerate(prediction[tokens]):
            self._vertices.add(dep + 1)
            self._edges[head].add(dep + 1)
        self._indices = {}
        self._lowlinks = {}
        self._onstack = defaultdict(lambda: False)
        self._SCCs = []

        index = 0
        stack = []
        for v in self.vertices:
            if v not in self.indices:
                # thread the counter through so that discovery indices stay unique across DFS trees
                index = self.strongconnect(v, index, stack)

    # =============================================================
    def strongconnect(self, v, index, stack):
        """Run the depth-first step of Tarjan's algorithm starting at ``v``.

        Every component that is completed during this call is appended to :attr:`SCCs` as a ``set``.

        Args:
          v: the vertex to visit; it must not have been visited yet
          index: the smallest discovery index that has not been handed out yet
          stack: the shared list of visited vertices that are not yet assigned to a component

        Returns:
          The next unused discovery index. The counter is an immutable ``int``, so it has to be
          returned and re-assigned by the caller; otherwise siblings in the DFS tree would share
          indices and a back edge into an earlier branch could split one component in two.

        """

        self._indices[v] = index
        self._lowlinks[v] = index
        index += 1
        stack.append(v)
        self._onstack[v] = True
        for w in self.edges[v]:
            if w not in self.indices:
                # tree edge: recurse, then inherit the lowest index reachable from the subtree
                index = self.strongconnect(w, index, stack)
                self._lowlinks[v] = min(self._lowlinks[v], self._lowlinks[w])
            elif self._onstack[w]:
                # edge into a vertex of a still-open component
                self._lowlinks[v] = min(self._lowlinks[v], self._indices[w])

        if self._lowlinks[v] == self._indices[v]:
            # v is the root of a component: everything above it on the stack belongs to it
            self._SCCs.append(set())
            while stack[-1] != v:
                w = stack.pop()
                self._onstack[w] = False
                self._SCCs[-1].add(w)
            w = stack.pop()
            self._onstack[w] = False
            self._SCCs[-1].add(w)
        return index

    # ======================
    @property
    def edges(self):
        """Mapping ``head -> set of dependents`` (a ``defaultdict(set)``)."""
        return self._edges

    @property
    def vertices(self):
        """The set of vertex ids, always including ``0``."""
        return self._vertices

    @property
    def indices(self):
        """Mapping ``vertex -> discovery index`` of the vertices visited so far."""
        return self._indices

    @property
    def SCCs(self):
        """The list of strongly connected components, each one a ``set`` of vertex ids."""
        return self._SCCs


def tarjan(parse_probs, length, tokens_to_keep, ensure_tree=True):
    """Pick a head for every token from arc probabilities, optionally repairing the result heuristically.

    Adopted from Timothy Dozat https://github.com/tdozat/Parser/blob/master/lib/models/nn.py

    With ``ensure_tree`` the greedy argmax is post-processed so that exactly one token in
    ``1..length-1`` is attached to index ``0`` and one arc of every detected cycle is re-attached.
    The input ``parse_probs`` is not modified; all edits happen on a masked copy.

    Args:
      parse_probs(NDArray): seq_len x seq_len, the probability of arcs
      length(NDArray): sentence length including ROOT
      tokens_to_keep(NDArray): mask matrix
      ensure_tree: whether to apply the single-root and cycle-removal repairs (Default value = True)

    Returns:
      NDArray: the selected head index for every row of ``parse_probs``

    """
    if not ensure_tree:
        # block and pad heads
        return np.argmax(parse_probs * tokens_to_keep, axis=1)

    off_diagonal = 1 - np.eye(len(tokens_to_keep))
    # block loops and pad heads
    parse_probs = parse_probs * tokens_to_keep * off_diagonal
    heads = np.argmax(parse_probs, axis=1)
    tokens = np.arange(1, length)
    roots = np.where(heads[tokens] == 0)[0] + 1
    n_roots = len(roots)
    if n_roots < 1:
        # ensure at least one root: promote the token that loses the least by switching to head 0
        root_scores = parse_probs[tokens, 0]
        current_scores = parse_probs[tokens, heads[tokens]]
        promoted = tokens[np.argmax(root_scores / current_scores)]
        heads[promoted] = 0
    elif n_roots > 1:
        # ensure at most one root: remember the root scores, then forbid head 0 for all candidates
        root_scores = parse_probs[roots, 0]
        parse_probs[roots, 0] = 0
        # best non-root head of every candidate and how much it costs relative to being root
        fallback_heads = np.argmax(parse_probs[roots][:, tokens], axis=1) + 1
        fallback_ratio = parse_probs[roots, fallback_heads] / root_scores
        # the candidate with the worst fallback stays root, the others are re-attached
        kept_root = roots[np.argmin(fallback_ratio)]
        heads[roots] = fallback_heads
        heads[kept_root] = 0

    # remove cycles
    components = Tarjan(heads, tokens)
    for component in components.SCCs:
        if len(component) <= 1:
            continue
        # gather the cycle together with everything reachable from it through head -> dependent edges
        dependents = set()
        pending = list(component)
        while pending:
            node = pending.pop()
            if node in dependents:
                continue
            dependents.add(node)
            pending.extend(components.edges[node])
        # The indices of the nodes that participate in the cycle
        cycle = np.array(list(component))
        # The probabilities of the current heads
        old_heads = heads[cycle]
        old_head_probs = parse_probs[cycle, old_heads]
        # Set the probability of depending on a non-head to zero (every cycle node x every dependent)
        non_heads = np.array(list(dependents))
        parse_probs[cycle[:, None], non_heads[None, :]] = 0
        # Get new potential heads and their probabilities
        new_heads = np.argmax(parse_probs[cycle][:, tokens], axis=1) + 1
        new_head_probs = parse_probs[cycle, new_heads] / old_head_probs
        # Select the most probable change
        change = np.argmax(new_head_probs)
        moved = cycle[change]
        old_head = old_heads[change]
        new_head = new_heads[change]
        # Make the change and keep the edge map in sync for the remaining components
        heads[moved] = new_head
        components.edges[new_head].add(moved)
        components.edges[old_head].remove(moved)
    return heads


def rel_argmax(rel_probs, length, root, ensure_tree=True):
    """Fix the relation prediction by heuristic rules so that exactly one token carries the root relation.

    The input ``rel_probs`` is never modified: when several tokens compete for the root relation,
    the masking needed to find their second-best relation is done on a private copy of those rows.

    Args:
      rel_probs(NDArray): seq_len x rel_size
      length: real sentence length
      ensure_tree: whether to enforce a single root relation among tokens ``1..length-1``
        (Default value = True)
      root: the index of the root relation in the relation dimension

    Returns:
      NDArray: the selected relation index for every row of ``rel_probs``

    """
    rel_preds = np.argmax(rel_probs, axis=1)
    if not ensure_tree:
        return rel_preds

    tokens = np.arange(1, length)
    roots = np.where(rel_preds[tokens] == root)[0] + 1
    if len(roots) < 1:
        # no root at all: give the root relation to the token that scores it highest
        rel_preds[1 + np.argmax(rel_probs[tokens, root])] = root
    elif len(roots) > 1:
        root_probs = rel_probs[roots, root]
        # fancy indexing copies the rows, so zeroing the root column leaves the caller's array intact
        candidates = rel_probs[roots]
        candidates[:, root] = 0
        new_rel_preds = np.argmax(candidates, axis=1)
        new_rel_probs = candidates[np.arange(len(roots)), new_rel_preds] / root_probs
        # the token whose best alternative is the worst relative to root keeps the root relation
        new_root = roots[np.argmin(new_rel_probs)]
        rel_preds[roots] = new_rel_preds
        rel_preds[new_root] = root
    return rel_preds


================================================
FILE: hanlp/components/parsers/biaffine/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 20:43


================================================
FILE: hanlp/components/parsers/biaffine/biaffine.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import torch
import torch.nn as nn


class Biaffine(nn.Module):
    r"""
    First-order biaffine scorer.

    The layer owns a weight tensor :math:`W` of shape ``[n_out, n_in + bias_x, n_in + bias_y]``.
    A pair of vectors :math:`(x, y)` is scored as :math:`x^T W y`; when a bias flag is set,
    a constant ``1`` is appended to the corresponding vector before the product.

    References:
        - Timothy Dozat and Christopher D. Manning. 2017.
          `Deep Biaffine Attention for Neural Dependency Parsing`_.

    Args:
        n_in (int):
            The size of the input feature.
        n_out (int):
            The number of output channels.
        bias_x (bool):
            If ``True``, adds a bias term for tensor :math:`x`. Default: ``True``.
        bias_y (bool):
            If ``True``, adds a bias term for tensor :math:`y`. Default: ``True``.

    .. _Deep Biaffine Attention for Neural Dependency Parsing:
        https://openreview.net/forum?id=Hk95PK9le
    """

    def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True):
        super().__init__()

        self.n_in, self.n_out = n_in, n_out
        self.bias_x, self.bias_y = bias_x, bias_y
        # a set flag widens the matching weight dimension by one to hold the bias column
        x_dim = n_in + bias_x
        y_dim = n_in + bias_y
        self.weight = nn.Parameter(torch.Tensor(n_out, x_dim, y_dim))

        self.reset_parameters()

    def __repr__(self):
        fields = [f"n_in={self.n_in}", f"n_out={self.n_out}"]
        # bias flags are only reported when they are switched on
        if self.bias_x:
            fields.append(f"bias_x={self.bias_x}")
        if self.bias_y:
            fields.append(f"bias_y={self.bias_y}")
        return f"{self.__class__.__name__}({', '.join(fields)})"

    def reset_parameters(self):
        """Fill the weight tensor with zeros."""
        nn.init.zeros_(self.weight)

    @staticmethod
    def _with_bias(t):
        """Append a constant-one feature along the last dimension of ``t``."""
        return torch.cat((t, torch.ones_like(t[..., :1])), -1)

    def forward(self, x, y):
        r"""
        Args:
            x (torch.Tensor): ``[batch_size, seq_len, n_in]``.
            y (torch.Tensor): ``[batch_size, seq_len, n_in]``.

        Returns:
            ~torch.Tensor:
                A scoring tensor of shape ``[batch_size, n_out, seq_len, seq_len]``.
                If ``n_out=1``, the dimension for ``n_out`` will be squeezed automatically.
        """
        if self.bias_x:
            x = self._with_bias(x)
        if self.bias_y:
            y = self._with_bias(y)
        # [batch_size, n_out, seq_len, seq_len]
        scores = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
        # dim 1 disappears only when n_out == 1
        return scores.squeeze(1)


================================================
FILE: hanlp/components/parsers/biaffine/biaffine_2nd_dep.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-06 13:57
import functools
from typing import Union, List, Any

import torch
from hanlp_common.constant import UNK
from hanlp.common.transform import TransformList
from hanlp.common.vocab import Vocab
from hanlp.components.parsers.biaffine.biaffine import Biaffine
from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDecoder, \
    EncoderWithContextualLayer
from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from hanlp.components.parsers.biaffine.biaffine_sdp import BiaffineSemanticDependencyParser
from hanlp_common.conll import CoNLLUWord, CoNLLSentence
from hanlp.components.parsers.parse_alg import add_secondary_arcs_by_preds
from hanlp.datasets.parsing.loaders.conll_dataset import append_bos
from hanlp.datasets.parsing.semeval15 import unpack_deps_to_head_deprel, merge_head_deprel_with_2nd
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs
from transformers import PreTrainedModel, PreTrainedTokenizer


class BiaffineSeparateDecoder(torch.nn.Module):
    """Two independent ``BiaffineDecoder`` heads over the same encoder output.

    Both heads are built with the same MLP sizes and dropout; they differ only in the number of
    relations (``config.n_rels`` for the first, ``config.n_rels_2nd`` for the second).
    """

    def __init__(self, hidden_size, config) -> None:
        super().__init__()
        shared_args = (hidden_size, config.n_mlp_arc, config.n_mlp_rel, config.mlp_dropout)
        self.biaffine_decoder = BiaffineDecoder(*shared_args, config.n_rels)
        self.biaffine_decoder_2nd = BiaffineDecoder(*shared_args, config.n_rels_2nd)

    def forward(self, x, mask):
        """Run both decoders and pair their outputs position by position.

        Args:
          x: the encoder output fed to both decoders
          mask: the mask fed to both decoders

        Returns:
          A tuple whose i-th element is ``(first_decoder_output[i], second_decoder_output[i])``.

        """
        primary = self.biaffine_decoder(x, mask)
        secondary = self.biaffine_decoder_2nd(x, mask)
        return tuple(zip(primary, secondary))


class BiaffineJointDecoder(BiaffineDecoder):
    """A ``BiaffineDecoder`` with an extra pair of biaffine layers for secondary dependencies.

    The MLP outputs are computed once and scored twice: with the inherited attention layers and
    with the ``*_2nd`` layers created here.
    """

    def __init__(self, hidden_size, config) -> None:
        super().__init__(hidden_size, config.n_mlp_arc, config.n_mlp_rel, config.mlp_dropout, config.n_rels)
        # the Biaffine layers for secondary dep
        arc_layer_2nd = Biaffine(n_in=config.n_mlp_arc, bias_x=True, bias_y=False)
        rel_layer_2nd = Biaffine(n_in=config.n_mlp_rel, n_out=config.n_rels, bias_x=True, bias_y=True)
        self.arc_attn_2nd = arc_layer_2nd
        self.rel_attn_2nd = rel_layer_2nd

    def forward(self, x, mask=None, **kwargs: Any):
        """Score primary and secondary dependencies from the same MLP representations.

        Args:
          x: the encoder output passed to ``apply_mlps``
          mask: forwarded unchanged to ``decode`` (Default value = None)
          **kwargs: accepted and ignored

        Returns:
          ``((s_arc, s_arc_2nd), (s_rel, s_rel_2nd))``

        """
        arc_d, arc_h, rel_d, rel_h = self.apply_mlps(x)
        mlp_outputs = (arc_d, arc_h, rel_d, rel_h)
        s_arc, s_rel = self.decode(*mlp_outputs, mask, self.arc_attn, self.rel_attn)
        s_arc_2nd, s_rel_2nd = self.decode(*mlp_outputs, mask, self.arc_attn_2nd, self.rel_attn_2nd)
        arc_pair = (s_arc, s_arc_2nd)
        rel_pair = (s_rel, s_rel_2nd)
        return arc_pair, rel_pair


class BiaffineSecondaryModel(torch.nn.Module):
    """Encoder plus a decoder that scores both primary and secondary dependencies.

    The decoder is a ``BiaffineJointDecoder`` when ``config.joint`` is truthy and a
    ``BiaffineSeparateDecoder`` otherwise.
    """

    def __init__(self, config, pretrained_embed: torch.Tensor = None, transformer: PreTrainedModel = None,
                 transformer_tokenizer: PreTrainedTokenizer = None):
        super().__init__()
        self.encoder = EncoderWithContextualLayer(config, pretrained_embed, transformer, transformer_tokenizer)
        if config.joint:
            decoder_cls = BiaffineJointDecoder
        else:
            decoder_cls = BiaffineSeparateDecoder
        self.decoder = decoder_cls(self.encoder.hidden_size, config)

    def forward(self,
                words=None,
                feats=None,
                input_ids=None,
                token_span=None,
                mask=None, lens=None, **kwargs):
        """Encode the batch and decode the scores.

        All named arguments are passed positionally to the encoder; the encoder returns the
        representation together with the mask that is then handed to the decoder.
        Extra keyword arguments are accepted and ignored.
        """
        encoded, encoder_mask = self.encoder(words, feats, input_ids, token_span, mask, lens)
        return self.decoder(encoded, encoder_mask)


class BiaffineSecondaryParser(BiaffineDependencyParser):
    """Biaffine dependency parser that also predicts secondary dependencies.

    Primary (``1st``) scores are handled by the inherited ``BiaffineDependencyParser`` logic, while
    secondary (``2nd``) scores reuse the unbound methods of ``BiaffineSemanticDependencyParser``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.model: BiaffineSecondaryModel = None

    def build_dataset(self, data, bos_transform=None):
        """Build the dataset with the transforms needed for secondary dependencies.

        The pipeline is: ``append_bos`` (with ``pos_key='UPOS'``), ``unpack_deps_to_head_deprel`` writing
        to ``arc_2nd``/``rel_2nd``, then ``merge_head_deprel_with_2nd`` if ``config.joint``, then
        ``bos_transform`` if one is given.
        """
        unpack_2nd = functools.partial(unpack_deps_to_head_deprel, pad_rel=self.config.pad_rel,
                                       arc_key='arc_2nd',
                                       rel_key='rel_2nd')
        transform = TransformList(functools.partial(append_bos, pos_key='UPOS'), unpack_2nd)
        optional_steps = (
            (self.config.joint, merge_head_deprel_with_2nd),
            (bos_transform, bos_transform),
        )
        for enabled, step in optional_steps:
            if enabled:
                transform.append(step)
        return super().build_dataset(data, transform)

    def build_criterion(self, **kwargs):
        """Return a pair ``(criterion_1st, criterion_2nd)``."""
        criterion_1st = super().build_criterion(**kwargs)
        # noinspection PyCallByClass
        criterion_2nd = BiaffineSemanticDependencyParser.build_criterion(self, **kwargs)
        return criterion_1st, criterion_2nd

    def fit(self, trn_data, dev_data, save_dir, feat=None, n_embed=100, pretrained_embed=None, transformer=None,
            average_subwords=False, word_dropout: float = 0.2, transformer_hidden_dropout=None, layer_dropout=0,
            scalar_mix: int = None, embed_dropout=.33, n_lstm_hidden=400, n_lstm_layers=3, hidden_dropout=.33,
            n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, lr=2e-3, transformer_lr=5e-5, mu=.9, nu=.9, epsilon=1e-12,
            clip=5.0, decay=.75, decay_steps=5000, patience=100, batch_size=None, sampler_builder=None,
            lowercase=False, epochs=50000, tree=False, punct=False, min_freq=2,
            apply_constraint=True, joint=False, no_cycle=False, root=None,
            logger=None,
            verbose=True, unk=UNK, pad_rel=None, max_sequence_length=512, devices: Union[float, int, List[int]] = None,
            transform=None, **kwargs):
        # NOTE: locals() is snapshotted here, so no local variable may be introduced before this line.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def build_vocabs(self, dataset, logger=None, transformer=None):
        """Register the ``rel_2nd`` vocab before delegating, then record its size in the config.

        In joint mode the very same vocab object is also registered under ``rel``.
        """
        pad_rel = self.config.pad_rel
        rel_2nd = Vocab(pad_token=pad_rel, unk_token=pad_rel)
        self.vocabs['rel_2nd'] = rel_2nd
        if self.config.joint:
            self.vocabs['rel'] = rel_2nd
        super().build_vocabs(dataset, logger, transformer)
        self.config.n_rels_2nd = len(rel_2nd)

    def create_model(self, pretrained_embed, transformer):
        return BiaffineSecondaryModel(self.config, pretrained_embed, transformer, self.transformer_tokenizer)

    def compute_loss(self, arc_scores, rel_scores, arcs, rels, mask, criterion, batch=None):
        """Sum the primary loss and the secondary loss.

        ``criterion`` is indexed as ``criterion[0]`` for the primary and ``criterion[1]`` for the secondary loss.
        The secondary targets are read from ``batch['arc_2nd']`` and ``batch['rel_2nd_id']``.
        """
        arc_1st, arc_2nd, rel_1st, rel_2nd = self.unpack_scores(arc_scores, rel_scores)
        primary_loss = super().compute_loss(arc_1st, rel_1st, arcs, rels, mask, criterion[0], batch)
        mask_2nd = self.compute_mask(arc_2nd, batch, mask)
        # noinspection PyCallByClass
        secondary_loss = BiaffineSemanticDependencyParser.compute_loss(self, arc_2nd, rel_2nd,
                                                                       batch['arc_2nd'], batch['rel_2nd_id'],
                                                                       mask_2nd,
                                                                       criterion[1], batch)
        return primary_loss + secondary_loss

    @staticmethod
    def compute_mask(arc_scores_2nd, batch, mask_1st):
        """Return ``batch['mask_2nd']``, computing and caching it in ``batch`` on first use."""
        cached = batch.get('mask_2nd', None)
        if cached is not None:
            return cached
        mask = BiaffineSemanticDependencyParser.convert_to_3d_mask(arc_scores_2nd, mask_1st)
        batch['mask_2nd'] = mask
        return mask

    def unpack_scores(self, arc_scores, rel_scores):
        """Flatten ``(arc_1st, arc_2nd), (rel_1st, rel_2nd)`` into ``arc_1st, arc_2nd, rel_1st, rel_2nd``."""
        (arc_1st, arc_2nd), (rel_1st, rel_2nd) = arc_scores, rel_scores
        return arc_1st, arc_2nd, rel_1st, rel_2nd

    def get_pad_dict(self):
        pad_dict = super(BiaffineSecondaryParser, self).get_pad_dict()
        pad_dict.update(arc_2nd=False)
        return pad_dict

    def decode(self, arc_scores, rel_scores, mask, batch=None, predicting=None):
        """Decode primary and secondary predictions and pair them up.

        If ``batch['outputs']`` already holds a non-``None`` first element, it is reused as is.
        Otherwise both score sets are decoded; with ``config['no_cycle']`` set (prediction only), the
        secondary arcs are rebuilt sentence by sentence with ``add_secondary_arcs_by_preds`` and the
        secondary output becomes ``(None, graphs)``.

        Returns:
          ``tuple(zip(output_1st, output_2nd))``

        """
        output_1st, output_2nd = batch.get('outputs', (None, None))
        if output_1st is None:
            arc_scores_1st, arc_scores_2nd, rel_scores_1st, rel_scores_2nd = self.unpack_scores(arc_scores, rel_scores)
            output_1st = super().decode(arc_scores_1st, rel_scores_1st, mask)
            mask = self.compute_mask(arc_scores_2nd, batch, mask)
            # noinspection PyCallByClass
            output_2nd = BiaffineSemanticDependencyParser.decode(self, arc_scores_2nd, rel_scores_2nd, mask, batch)
            if self.config.get('no_cycle'):
                assert predicting, 'No cycle constraint for evaluation is not implemented yet. If you are ' \
                                   'interested, welcome to submit a pull request.'
                root_rel_idx = self.vocabs['rel'].token_to_idx.get(self.config.get('root', None), None)
                arc_pred_1st, rel_pred_1st, arc_pred_2nd, rel_pred_2nd = *output_1st, *output_2nd
                # move everything to numpy once, before the per-sentence loop
                arc_scores_2nd = arc_scores_2nd.transpose(1, 2).cpu().detach().numpy()
                arc_pred_2nd = arc_pred_2nd.cpu().detach().numpy()
                rel_pred_2nd = rel_pred_2nd.cpu().detach().numpy()
                trees = arc_pred_1st.cpu().detach().numpy()
                graphs = []
                per_sentence = zip(arc_scores_2nd, arc_pred_2nd, rel_pred_2nd, trees, batch['token'])
                for sent_scores, sent_arcs, sent_rels, tree, tokens in per_sentence:
                    sent_len = len(tokens)
                    graph = add_secondary_arcs_by_preds(sent_scores, sent_arcs[:sent_len, :sent_len], sent_rels,
                                                        tree[:sent_len], root_rel_idx)
                    graphs.append(graph[1:])  # Remove root
                    # Writing the graph back to torch tensors (needed for evaluation) is not implemented.
                output_2nd = None, graphs

        return tuple(zip(output_1st, output_2nd))

    def update_metric(self, arc_preds, rel_preds, arcs, rels, mask, puncts, metric, batch=None):
        """Update ``metric['1st']`` with the primary and ``metric['2nd']`` with the secondary predictions."""
        super().update_metric(arc_preds[0], rel_preds[0], arcs, rels, mask, puncts, metric['1st'], batch)
        puncts_2nd = BiaffineSemanticDependencyParser.convert_to_3d_puncts(puncts, batch['mask_2nd'])
        # noinspection PyCallByClass
        BiaffineSemanticDependencyParser.update_metric(self, arc_preds[1], rel_preds[1], batch['arc_2nd'],
                                                       batch['rel_2nd_id'], batch['mask_2nd'], puncts_2nd,
                                                       metric['2nd'],
                                                       batch)

    def build_metric(self, **kwargs):
        """Return a ``MetricDict`` with one metric for ``1st`` and one for ``2nd``."""
        metric_1st = super().build_metric(**kwargs)
        # noinspection PyCallByClass
        metric_2nd = BiaffineSemanticDependencyParser.build_metric(self, **kwargs)
        return MetricDict({'1st': metric_1st, '2nd': metric_2nd})

    def collect_outputs_extend(self, predictions: list, arc_preds, rel_preds, lens, mask):
        # only the secondary relation output is collected
        predictions.extend(rel_preds[1])

    def predictions_to_human(self, predictions, outputs, data, use_pos, conll=True):
        """Append one ``CoNLLSentence`` per sentence to ``outputs``.

        Each entry of a predicted graph is a sequence of ``(head, rel_id)`` pairs: the first pair
        becomes ``head``/``deprel`` and the remaining ones become ``deps``.
        The ``conll`` flag is accepted but not consulted here.
        """
        rel_vocab = self.vocabs['rel'].idx_to_token
        for sentence, graph in zip(data, predictions):
            sent = CoNLLSentence()
            for word_id, (cell, hrs) in enumerate(zip(sentence, graph), start=1):
                token, pos = cell if use_pos else (cell, None)
                primary = hrs[0]
                head = primary[0]
                deprel = rel_vocab[primary[1]]
                deps = [(h, rel_vocab[r]) for h, r in hrs[1:]]
                sent.append(CoNLLUWord(word_id, token, upos=pos, head=head, deprel=deprel, deps=deps))
            outputs.append(sent)


================================================
FILE: hanlp/components/parsers/biaffine/biaffine_dep.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 20:51
import os
from collections import Counter
from typing import Union, Any, List

from hanlp.layers.transformers.pt_imports import PreTrainedTokenizer, AutoModel_, AutoTokenizer_
import torch
from hanlp.utils.torch_util import lengths_to_mask
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import DataLoader
from hanlp_common.constant import ROOT, UNK, IDX
from hanlp.common.dataset import PadSequenceDataLoader
from hanlp.common.structure import History
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import LowerCase, FieldLength, PunctuationMask
from hanlp.common.vocab import Vocab
from hanlp.components.parsers.alg import decode_dep
from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDependencyModel
from hanlp_common.conll import CoNLLWord, CoNLLSentence
from hanlp.datasets.parsing.loaders.conll_dataset import CoNLLParsingDataset, append_bos
from hanlp.layers.embeddings.util import index_word2vec_with_vocab
from hanlp.layers.transformers.utils import build_optimizer_scheduler_with_transformer
from hanlp.metrics.parsing.attachmentscore import AttachmentScore
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import isdebugging, merge_locals_kwargs, merge_dict, reorder


class BiaffineDependencyParser(TorchComponent):
    def __init__(self) -> None:
        """Biaffine dependency parsing (:cite:`dozat:17a`).

        The model and the transformer tokenizer are both initialised to ``None``.
        """
        super().__init__()
        self.transformer_tokenizer: PreTrainedTokenizer = None
        self.model: BiaffineDependencyModel = None

    def predict(self, data: Any, batch_size=None, batch_max_tokens=None, conll=True, **kwargs):
        """Parse one sentence or a list of sentences.

        Args:
          data: a single sentence or a list of sentences; falsy input short-circuits to ``[]``
          batch_size: overrides ``config.batch_size`` when truthy (Default value = None)
          batch_max_tokens: overrides ``config['batch_max_tokens']`` when truthy (Default value = None)
          conll: forwarded to :meth:`post_outputs` (Default value = True)
          **kwargs: merged into the dataloader arguments

        Returns:
          The outputs for all sentences, or only the first output when ``data`` was a single sentence.

        """
        if not data:
            return []
        use_pos = self.use_pos
        flat = self.input_is_flat(data, use_pos)
        if flat:
            # wrap a single sentence so the rest of the pipeline always sees a batch
            data = [data]
        samples = self.build_samples(data, use_pos)
        batch_max_tokens = batch_max_tokens or self.config.get('batch_max_tokens', None)
        batch_size = batch_size or self.config.batch_size
        loader_args = merge_dict(self.config,
                                 batch_size=batch_size,
                                 batch_max_tokens=batch_max_tokens,
                                 overwrite=True,
                                 **kwargs)
        dataloader = self.build_dataloader(samples, device=self.devices[0], shuffle=False, **loader_args)
        predictions, build_data, data, order = self.before_outputs(data)
        for batch in dataloader:
            arc_scores, rel_scores, mask, _puncts = self.feed_batch(batch)
            self.collect_outputs(arc_scores, rel_scores, mask, batch, predictions, order, data, use_pos,
                                 build_data)
        outputs = self.post_outputs(predictions, data, order, use_pos, build_data, conll=conll)
        return outputs[0] if flat else outputs

    def build_samples(self, data, use_pos=None):
        """Turn raw sentences into sample dicts.

        Every sample stores its position under ``IDX`` and its tokens under ``FORM``. With ``use_pos``
        each sentence is expected to be a sequence of ``(token, pos)`` pairs, and the tags are stored
        under ``CPOS`` if that vocab exists, otherwise under ``UPOS``.
        """
        pos_key = 'CPOS' if 'CPOS' in self.vocabs else 'UPOS'
        samples = []
        for position, sentence in enumerate(data):
            if use_pos:
                tokens, tags = zip(*sentence)
                sample = {IDX: position, 'FORM': list(tokens), pos_key: list(tags)}
            else:
                sample = {IDX: position, 'FORM': list(sentence)}
            samples.append(sample)
        return samples

    def input_is_flat(self, data, use_pos=None):
        """Tell whether ``data`` is a single sentence rather than a list of sentences.

        Only the first element is inspected. Without POS a flat sentence starts with a ``str``;
        with POS it starts with a list/tuple whose first item is a ``str``. When ``use_pos`` is
        ``None`` it falls back to whether a ``CPOS`` vocab exists.
        """
        if use_pos is None:
            use_pos = 'CPOS' in self.vocabs
        first = data[0]
        if not use_pos:
            return isinstance(first, str)
        return isinstance(first, (list, tuple)) and isinstance(first[0], str)

    def before_outputs(self, data):
        """Create the accumulators used while collecting predictions.

        Returns:
          ``(predictions, build_data, data, order)`` where ``predictions`` and ``order`` are fresh
          empty lists, ``build_data`` is ``True`` only if ``data`` was ``None``, and ``data`` is
          replaced by an empty list in that case.

        """
        build_data = data is None
        collected = [] if build_data else data
        return [], build_data, collected, []

    def post_outputs(self, predictions, data, order, use_pos, build_data, conll=True):
        """Reorder the collected predictions by ``order`` and convert them to the final outputs.

        ``data`` is reordered as well, but only when it was built from the batches (``build_data``).
        """
        ordered_predictions = reorder(predictions, order)
        ordered_data = reorder(data, order) if build_data else data
        outputs = []
        self.predictions_to_human(ordered_predictions, outputs, ordered_data, use_pos, conll=conll)
        return outputs

    def predictions_to_human(self, predictions, outputs, data, use_pos, conll=True):
        """Append one converted sentence per ``(data, predictions)`` pair to ``outputs``.

        With ``conll`` each sentence becomes a ``CoNLLSentence`` of ``CoNLLWord``; otherwise it becomes
        a plain list of ``(head, relation)`` tuples. Relation ids are looked up in ``self.vocabs['rel']``.
        """
        for sentence, (arcs, rels) in zip(data, predictions):
            if conll:
                sent = CoNLLSentence()
                for word_id, (cell, head, rel_id) in enumerate(zip(sentence, arcs, rels), start=1):
                    token, pos = cell if use_pos else (cell, None)
                    sent.append(CoNLLWord(word_id, token, cpos=pos, head=head, deprel=self.vocabs['rel'][rel_id]))
            else:
                sent = [(head, self.vocabs['rel'][rel_id]) for head, rel_id in zip(arcs, rels)]
            outputs.append(sent)

    def collect_outputs(self, arc_scores, rel_scores, mask, batch, predictions, order, data, use_pos,
                        build_data):
        """Decode one batch and append its results to the running accumulators.

        ``predictions`` and ``order`` are always extended; ``data`` is extended from the batch only
        when ``build_data`` is set.
        """
        # every entry of batch['token'] is one longer than the sentence it was built from
        lens = [len(tokens) - 1 for tokens in batch['token']]
        arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask, batch)
        self.collect_outputs_extend(predictions, arc_preds, rel_preds, lens, mask)
        order.extend(batch[IDX])
        if not build_data:
            return
        if use_pos:
            data.extend(zip(batch['FORM'], batch['CPOS']))
        else:
            data.extend(batch['FORM'])

    def collect_outputs_extend(self, predictions: list, arc_preds, rel_preds, lens, mask):
        """Append one ``(arcs, rels)`` pair of plain lists per sentence to ``predictions``.

        The masked predictions are flattened and then cut back into sentences of the given ``lens``.
        """
        arcs_by_sentence = [chunk.tolist() for chunk in arc_preds[mask].split(lens)]
        rels_by_sentence = [chunk.tolist() for chunk in rel_preds[mask].split(lens)]
        predictions.extend(zip(arcs_by_sentence, rels_by_sentence))

    @property
    def use_pos(self):
        """``True`` exactly when the configured ``feat`` is ``'pos'``."""
        feat = self.config.get('feat', None)
        return feat == 'pos'

    def fit(self, trn_data, dev_data, save_dir,
            feat=None,
            n_embed=100,
            pretrained_embed=None,
            transformer=None,
            average_subwords=False,
            word_dropout=0.2,
            transformer_hidden_dropout=None,
            layer_dropout=0,
            scalar_mix: int = None,
            embed_dropout=.33,
            n_lstm_hidden=400,
            n_lstm_layers=3,
            hidden_dropout=.33,
            n_mlp_arc=500,
            n_mlp_rel=100,
            mlp_dropout=.33,
            lr=2e-3,
            transformer_lr=5e-5,
            mu=.9,
            nu=.9,
            epsilon=1e-12,
            grad_norm=5.0,
            decay=.75,
            decay_steps=5000,
            weight_decay=0,
            warmup_steps=0.1,
            separate_optimizer=False,
            patience=100,
            lowercase=False,
            epochs=50000,
            tree=False,
            proj=False,
            punct=False,
            min_freq=2,
            logger=None,
            verbose=True,
            unk=UNK,
            max_sequence_length=512,
            batch_size=None,
            sampler_builder=None,
            gradient_accumulation=1,
            devices: Union[float, int, List[int]] = None,
            transform=None,
            secondary_encoder=None,
            **kwargs):
        """Train the parser by forwarding every argument of this signature to the parent ``fit``.

        The method has no logic of its own: the named parameters exist to declare the
        hyper-parameters and their defaults. They are gathered through ``locals()`` and combined
        with ``kwargs`` by ``merge_locals_kwargs`` (presumably dropping ``self`` and flattening
        ``kwargs`` -- verify against ``hanlp_common.util``).

        Returns:
          Whatever the parent ``fit`` returns.

        """
        # NOTE: locals() is snapshotted on this line, so do not bind any local variable above it;
        # anything bound here would be forwarded to the parent as an extra argument.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def execute_training_loop(self, trn, dev, devices, epochs, logger, patience, save_dir, optimizer,
                              gradient_accumulation, **kwargs):
        """Train for up to ``epochs`` epochs, keeping the weights of the best dev epoch.

        After every epoch the dev set is evaluated; weights are saved whenever the dev metric beats
        the best one so far. Training stops early once ``patience`` epochs pass without improvement.
        At the end the best weights are reloaded unless the last epoch was the best, and weights are
        saved once if no epoch ever improved on the initial metric.

        Args:
          optimizer: a 4-tuple ``(optimizer, scheduler, transformer_optimizer, transformer_scheduler)``
        """
        optimizer, scheduler, transformer_optimizer, transformer_scheduler = optimizer
        criterion = self.build_criterion()
        best_e, best_metric = 0, self.build_metric()
        timer = CountdownTimer(epochs)
        history = History()
        # width of the "done/total" step counter, used to align the dev report
        steps_per_epoch = len(trn) // gradient_accumulation
        ratio_width = len(f'{steps_per_epoch}/{steps_per_epoch}')
        for epoch in range(1, epochs + 1):
            # train one epoch and update the parameters
            logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
            self.fit_dataloader(trn, optimizer, scheduler, criterion, epoch, logger, history,
                                transformer_optimizer, transformer_scheduler,
                                gradient_accumulation=gradient_accumulation)
            loss, dev_metric = self.evaluate_dataloader(dev, criterion, ratio_width=ratio_width, logger=logger)
            timer.update()
            report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
            # save the model if it is the best so far
            if dev_metric > best_metric:
                best_e, best_metric = epoch, dev_metric
                self.save_weights(save_dir)
                report += ' ([red]saved[/red])'
            elif patience != epochs:
                report += f' ({epoch - best_e}/{patience})'
            else:
                report += f' ({epoch - best_e})'
            logger.info(report)
            out_of_patience = patience is not None and epoch - best_e >= patience
            if out_of_patience:
                logger.info(f'LAS has stopped improving for {patience} epochs, early stop.')
                break
        timer.stop()
        if not best_e:
            # nothing was ever saved, so persist the final state
            self.save_weights(save_dir)
        elif best_e != epoch:
            # the last epoch was not the best one: roll back to the saved weights
            self.load_weights(save_dir)
        logger.info(f"Max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        logger.info(f"Average time of each epoch is {timer.elapsed_average_human}")
        logger.info(f"{timer.elapsed_human} elapsed")

    def build_optimizer(self, epochs, trn, gradient_accumulation, **kwargs):
        """Build the optimizers and learning-rate schedulers used by the training loop.

        Args:
            epochs: Number of training epochs, used to compute the total number of update steps.
            trn: Training dataloader; only its length is used.
            gradient_accumulation: Number of batches accumulated per optimizer update.
            **kwargs: Unused here.

        Returns:
            A tuple ``(optimizer, scheduler, transformer_optimizer, transformer_scheduler)``. The last two
            are ``None`` unless a transformer is fine-tuned with ``separate_optimizer`` enabled.
        """
        config = self.config
        model = self.model
        if isinstance(model, nn.DataParallel):
            model = model.module
        scheduler = None
        transformer_optimizer, transformer_scheduler = None, None
        if config.transformer:
            transformer = model.encoder.transformer
            # Keep the non-transformer parameters in an ordered list: a set would give Adam a
            # parameter order that changes from run to run.
            transformer_param_ids = {id(p) for p in transformer.parameters()}
            optimizer = Adam([p for p in model.parameters() if id(p) not in transformer_param_ids],
                             config.lr,
                             (config.mu, config.nu),
                             config.epsilon)
            if config.transformer_lr:
                num_training_steps = len(trn) * epochs // gradient_accumulation
                if config.separate_optimizer:
                    # The transformer gets its own optimizer/scheduler; the rest keeps Adam.
                    transformer_optimizer, transformer_scheduler = \
                        build_optimizer_scheduler_with_transformer(transformer,
                                                                   transformer,
                                                                   config.transformer_lr,
                                                                   config.transformer_lr,
                                                                   num_training_steps,
                                                                   config.warmup_steps,
                                                                   config.weight_decay,
                                                                   adam_epsilon=1e-8)
                else:
                    # A single optimizer/scheduler pair drives the whole model.
                    optimizer, scheduler = build_optimizer_scheduler_with_transformer(model,
                                                                                      transformer,
                                                                                      config.lr,
                                                                                      config.transformer_lr,
                                                                                      num_training_steps,
                                                                                      config.warmup_steps,
                                                                                      config.weight_decay,
                                                                                      adam_epsilon=1e-8)
            else:
                # No learning rate for the transformer: exclude it from gradient computation.
                transformer.requires_grad_(False)
        else:
            optimizer = Adam(model.parameters(),
                             config.lr,
                             (config.mu, config.nu),
                             config.epsilon)
        # Previously ``scheduler`` stayed unbound (UnboundLocalError) when ``separate_optimizer`` was
        # falsy and no transformer scheduler had been built; fall back to exponential decay then.
        if config.separate_optimizer or scheduler is None:
            scheduler = ExponentialLR(optimizer, config.decay ** (1 / config.decay_steps))
        return optimizer, scheduler, transformer_optimizer, transformer_scheduler

    def build_transformer_tokenizer(self):
        """Create the tokenizer for ``config.transformer`` and store it on ``self.transformer_tokenizer``.

        Returns:
            The tokenizer, or ``None`` when no transformer is configured.
        """
        name = self.config.transformer
        tokenizer = AutoTokenizer_.from_pretrained(name, use_fast=True) if name else None
        self.transformer_tokenizer = tokenizer
        return tokenizer

    # noinspection PyMethodOverriding
    def build_dataloader(self,
                         data,
                         shuffle,
                         device,
                         training=False,
                         logger=None,
                         gradient_accumulation=1,
                         sampler_builder=None,
                         batch_size=None,
                         **kwargs) -> DataLoader:
        """Turn ``data`` into a padded, batched dataloader.

        Args:
            data: Input accepted by ``build_dataset``; a ``str`` is treated as a file to cache.
            shuffle: Forwarded to ``sampler_builder.build``.
            device: Device handed to the dataloader.
            training: Forwarded to ``cache_dataset``.
            logger: Logger used while building vocabs and caching.
            gradient_accumulation: Forwarded to ``sampler_builder.build``.
            sampler_builder: Optional builder of a batch sampler from the sample lengths.
            batch_size: Batch size handed to the dataloader.
            **kwargs: Unused here.

        Returns:
            A ``PadSequenceDataLoader`` over the transformed dataset.
        """
        config = self.config
        from_file = isinstance(data, str)
        dataset = self.build_dataset(data)
        if self.vocabs.mutable:
            # Vocabs are still open, so build them from this dataset.
            self.build_vocabs(dataset, logger, config.transformer)
        if self.transformer_tokenizer:
            dataset.transform.append(self.build_tokenizer_transform())
        dataset.append_transform(FieldLength('token', 'sent_length'))
        if from_file:
            dataset.purge_cache()
        if len(dataset) > 1000 and from_file:
            self.cache_dataset(dataset, CountdownTimer(len(dataset)), training, logger)
        # Samplers bucket by subword count with a transformer, otherwise by token count.
        if config.transformer:
            lens = [len(sample['input_ids']) for sample in dataset]
        else:
            lens = [sample['sent_length'] for sample in dataset]
        sampler = sampler_builder.build(lens, shuffle, gradient_accumulation) if sampler_builder else None
        return PadSequenceDataLoader(dataset=dataset,
                                     batch_sampler=sampler,
                                     batch_size=batch_size,
                                     pad=self.get_pad_dict(),
                                     device=device,
                                     vocabs=self.vocabs)

    def cache_dataset(self, dataset, timer, training=False, logger=None):
        """Iterate over ``dataset`` once, logging progress through ``timer`` for every sample.

        Args:
            dataset: Iterable of samples.
            timer: Object whose ``log`` method is called once per sample.
            training: Unused here.
            logger: Unused here.
        """
        message = 'Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]'
        for _ in dataset:
            timer.log(message)

    def get_pad_dict(self):
        """Return the per-field padding values handed to the dataloader: ``arc`` is padded with 0."""
        return dict(arc=0)

    def build_dataset(self, data, bos_transform=None):
        """Wrap ``data`` in a ``CoNLLParsingDataset`` with the configured transform pipeline.

        Args:
            data: Data forwarded to ``CoNLLParsingDataset``.
            bos_transform: First transform of the pipeline; defaults to ``append_bos``.

        Returns:
            The dataset with its transforms attached.
        """
        config = self.config
        pipeline = [bos_transform or append_bos]
        if config.get('transform', None):
            pipeline.append(config.transform)
        if config.get('lowercase', False):
            pipeline.append(LowerCase('token'))
        # The vocabs act as a transform themselves.
        pipeline.append(self.vocabs)
        if not config.punct:
            # Punctuation is excluded from scoring, so mark it.
            pipeline.append(PunctuationMask('token', 'punct_mask'))
        return CoNLLParsingDataset(data, transform=pipeline)

    def build_tokenizer_transform(self):
        """Build the transform that runs ``self.transformer_tokenizer`` over the ``token`` field."""
        max_seq_length = self.config.get('max_sequence_length', 512)
        return TransformerSequenceTokenizer(self.transformer_tokenizer, 'token', '',
                                            ret_token_span=True,
                                            cls_is_bos=True,
                                            max_seq_length=max_seq_length,
                                            truncate_long_sequences=False)

    def build_vocabs(self, dataset, logger=None, transformer=None):
        """Create the relation, optional POS and optional token vocabularies, then lock them.

        Iterating over ``dataset`` is what drives the vocabularies (presumably through the vocab
        transform installed by ``build_dataset``). Vocabulary sizes and the token pad/unk indices
        are written back into ``self.config``.

        Args:
            dataset: Dataset to iterate over.
            logger: Logger receiving the vocabulary summary.
            transformer: Truthy when a transformer is used, in which case no token vocab is built.
        """
        config = self.config
        vocabs = self.vocabs
        rel_vocab = vocabs.get('rel', None)
        if rel_vocab is None:
            rel_vocab = Vocab(unk_token=None, pad_token=config.get('pad_rel', None))
            vocabs.put(rel=rel_vocab)
        if config.get('feat', None) == 'pos' or config.get('use_pos', False):
            vocabs['pos'] = Vocab(unk_token=None, pad_token=None)

        timer = CountdownTimer(len(dataset))
        if transformer:
            token_vocab = None
        else:
            token_vocab = Vocab()
            vocabs.token = token_vocab
            unk = config.get('unk', None)
            if unk is not None:
                token_vocab.unk_token = unk
        if token_vocab and config.get('min_freq', None):
            # Count every form, then rebuild the token vocab from the frequent ones only.
            counter = Counter(form for sample in dataset for form in sample['token'])
            reserved_token = [token_vocab.pad_token, token_vocab.unk_token]
            if ROOT in token_vocab:
                reserved_token.append(ROOT)
            min_freq = config.min_freq
            frequent = [token for token, freq in counter.items() if freq >= min_freq]
            token_vocab.token_to_idx.clear()
            for word in reserved_token + frequent:
                token_vocab(word)
        else:
            # A plain pass over the dataset; only the progress is reported here.
            for _ in dataset:
                timer.log('vocab building [blink][yellow]...[/yellow][/blink]', ratio_percentage=True)
        rel_vocab.set_unk_as_safe_unk()  # Some relation in dev set is OOV
        vocabs.lock()
        vocabs.summary(logger=logger)
        if token_vocab:
            config.n_words = len(vocabs['token'])
        if 'pos' in vocabs:
            config.n_feats = len(vocabs['pos'])
            vocabs['pos'].set_unk_as_safe_unk()
        config.n_rels = len(vocabs['rel'])
        if token_vocab:
            config.pad_index = vocabs['token'].pad_idx
            config.unk_index = vocabs['token'].unk_idx

    def build_model(self, training=True, **kwargs) -> torch.nn.Module:
        """Build the embeddings, then the model on top of them.

        Args:
            training: Forwarded to ``build_embeddings``.
            **kwargs: Unused here.

        Returns:
            The model produced by ``create_model``.
        """
        pretrained_embed, transformer = self.build_embeddings(training=training)
        if pretrained_embed is not None:
            # The embedding width is dictated by the pretrained vectors.
            self.config.n_embed = pretrained_embed.size(-1)
        return self.create_model(pretrained_embed, transformer)

    def create_model(self, pretrained_embed, transformer):
        """Instantiate ``BiaffineDependencyModel`` from the config, the embeddings and the tokenizer."""
        tokenizer = self.transformer_tokenizer
        return BiaffineDependencyModel(self.config, pretrained_embed, transformer, tokenizer)

    def build_embeddings(self, training=True):
        """Load the optional pretrained word embeddings and the optional transformer.

        Args:
            training: Forwarded to ``AutoModel_.from_pretrained``.

        Returns:
            A pair ``(pretrained_embed, transformer)``; each element is falsy when not configured.
        """
        config = self.config
        pretrained_embed = None
        if config.get('pretrained_embed', None):
            pretrained_embed = index_word2vec_with_vocab(config.pretrained_embed, self.vocabs['token'],
                                                         init='zeros', normalize=True)
        transformer = config.transformer
        if not transformer:
            return pretrained_embed, transformer
        return pretrained_embed, AutoModel_.from_pretrained(transformer, training=training)

    # noinspection PyMethodOverriding
    def fit_dataloader(self,
                       trn,
                       optimizer,
                       scheduler,
                       criterion,
                       epoch,
                       logger,
                       history: History,
                       transformer_optimizer=None,
                       transformer_scheduler=None,
                       gradient_accumulation=1,
                       **kwargs):
        """Run one training epoch over ``trn``.

        Args:
            trn: Training dataloader.
            optimizer: Main optimizer.
            scheduler: Scheduler stepped together with ``optimizer``.
            criterion: Loss function(s) forwarded to ``compute_loss``.
            epoch: Unused here.
            logger: Logger receiving the progress reports.
            history: Step tracker; ``history.step`` decides when an optimizer update happens.
            transformer_optimizer: Optional separate optimizer for the transformer.
            transformer_scheduler: Scheduler paired with ``transformer_optimizer``.
            gradient_accumulation: Number of batches accumulated per optimizer update.
            **kwargs: Unused here.
        """
        self.model.train()

        timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation))
        metric = self.build_metric(training=True)
        running_loss = 0
        for batch in trn:
            arc_scores, rel_scores, mask, puncts = self.feed_batch(batch)
            gold_arcs, gold_rels = batch['arc'], batch['rel_id']
            loss = self.compute_loss(arc_scores, rel_scores, gold_arcs, gold_rels, mask, criterion, batch)
            if gradient_accumulation > 1:
                # Scale so that the accumulated gradient matches one large batch.
                loss /= gradient_accumulation
            loss.backward()
            running_loss += loss.item()
            arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask, batch)
            self.update_metric(arc_preds, rel_preds, gold_arcs, gold_rels, mask, puncts, metric, batch)
            if history.step(gradient_accumulation):
                self._step(optimizer, scheduler, transformer_optimizer, transformer_scheduler)
                timer.log(self._report(running_loss / (timer.current + 1), metric),
                          ratio_percentage=False, logger=logger)
            del loss

    def _step(self, optimizer, scheduler, transformer_optimizer, transformer_scheduler):
        if self.config.get('grad_norm', None):
            nn.utils.clip_grad_norm_(self.model.parameters(),
                                     self.config.grad_norm)
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()
        if self.config.transformer and self.config.transformer_lr and transformer_optimizer:
            transformer_optimizer.step()
            transformer_optimizer.zero_grad()
            transformer_scheduler.step()

    def feed_batch(self, batch):
        """Run the model on ``batch``.

        Args:
            batch: Dict of batched fields; ``sent_length`` is required, ``token_id``, ``pos_id``
                and ``punct_mask`` are optional.

        Returns:
            A tuple ``(arc_scores, rel_scores, mask, puncts)`` where the first position of every
            sentence is masked out.
        """
        words = batch.get('token_id', None)
        feats = batch.get('pos_id', None)
        lens = batch['sent_length']
        puncts = batch.get('punct_mask', None)
        mask = lengths_to_mask(lens)
        arc_scores, rel_scores = self.model(words=words, feats=feats, mask=mask, batch=batch, **batch)
        # Clone during training before the in-place write below, otherwise autograd fails with
        # "RuntimeError: one of the variables needed for gradient computation has been modified
        # by an inplace operation".
        if self.model.training:
            mask = mask.clone()
        # ignore the first token of each sentence
        mask[:, 0] = 0
        return arc_scores, rel_scores, mask, puncts

    def _report(self, loss, metric: AttachmentScore):
        """Format the running loss (4 decimals) followed by the metric for progress logging."""
        message = f'loss: {loss:.4f} {metric}'
        return message

    def compute_loss(self, arc_scores, rel_scores, arcs, rels, mask, criterion, batch=None):
        """Sum the arc loss and the relation loss over the positions selected by ``mask``.

        Args:
            arc_scores: Arc scores, indexed by ``mask`` to keep the scored positions.
            rel_scores: Relation scores, indexed by ``mask`` and then by the gold head.
            arcs: Gold heads.
            rels: Gold relation ids.
            mask: Boolean mask of the positions to score.
            criterion: Loss function applied to both the arc and the relation scores.
            batch: Unused here.

        Returns:
            ``criterion(arc) + criterion(rel)``.
        """
        gold_arcs = arcs[mask]
        gold_rels = rels[mask]
        masked_arc_scores = arc_scores[mask]
        # Relations are scored at the gold head of every kept token.
        rel_scores_at_gold = rel_scores[mask][torch.arange(len(gold_arcs)), gold_arcs]
        arc_loss = criterion(masked_arc_scores, gold_arcs)
        rel_loss = criterion(rel_scores_at_gold, gold_rels)
        return arc_loss + rel_loss

    # noinspection PyUnboundLocalVariable
    @torch.no_grad()
    def evaluate_dataloader(self, loader: PadSequenceDataLoader, criterion, logger=None, filename=None, output=False,
                            ratio_width=None,
                            metric=None,
                            **kwargs):
        """Evaluate the model on ``loader`` and optionally write the predictions to a file.

        Args:
            loader: Dataloader to evaluate on.
            criterion: Loss function(s) forwarded to ``compute_loss``.
            logger: Optional logger for progress and for the "predictions saved" message.
            filename: If given, its basename prefixes every progress report.
            output: Falsy to skip writing, otherwise the path of the file receiving the predictions.
            ratio_width: Forwarded to the progress timer.
            metric: Metric to accumulate into; a fresh one is built when falsy.
            **kwargs: Unused here.

        Returns:
            A pair ``(loss, metric)`` where ``loss`` is averaged over the batches.
        """
        self.model.eval()

        loss = 0
        if not metric:
            metric = self.build_metric()
        fp = None
        try:
            if output:
                # Open up-front so that an invalid path fails before the evaluation starts.
                fp = open(output, 'w', encoding='utf-8')
                predictions, build_data, data, order = self.before_outputs(None)

            timer = CountdownTimer(len(loader))
            use_pos = self.use_pos
            for batch in loader:
                arc_scores, rel_scores, mask, puncts = self.feed_batch(batch)
                if output:
                    self.collect_outputs(arc_scores, rel_scores, mask, batch, predictions, order, data, use_pos,
                                         build_data)
                arcs, rels = batch['arc'], batch['rel_id']
                loss += self.compute_loss(arc_scores, rel_scores, arcs, rels, mask, criterion, batch).item()
                arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask, batch)
                self.update_metric(arc_preds, rel_preds, arcs, rels, mask, puncts, metric, batch)
                report = self._report(loss / (timer.current + 1), metric)
                if filename:
                    report = f'{os.path.basename(filename)} ' + report
                timer.log(report, ratio_percentage=False, logger=logger, ratio_width=ratio_width)
            num_batches = len(loader)
            if num_batches:  # an empty loader would otherwise divide by zero
                loss /= num_batches
            if output:
                outputs = self.post_outputs(predictions, data, order, use_pos, build_data)
                for each in outputs:
                    fp.write(f'{each}\n\n')
        finally:
            # Release the file handle even when evaluation fails half-way.
            if fp is not None:
                fp.close()
        if output and logger is not None:
            logger.info(f'Predictions saved in [underline][yellow]{output}[/yellow][/underline]')

        return loss, metric

    def update_metric(self, arc_preds, rel_preds, arcs, rels, mask, puncts, metric, batch=None):
        """Feed the predictions and the gold annotations of one batch into ``metric``.

        Args:
            arc_preds: Predicted heads.
            rel_preds: Predicted relation ids.
            arcs: Gold heads.
            rels: Gold relation ids.
            mask: Boolean mask of the positions to score; left untouched.
            puncts: Boolean mask combined with ``mask`` when ``config.punct`` is falsy.
            metric: Callable metric receiving ``(arc_preds, rel_preds, arcs, rels, mask)``.
            batch: Unused here.
        """
        # ignore all punctuation if not specified
        if not self.config.punct:
            # Build a new tensor rather than ``mask &= puncts`` so the caller's mask is not mutated.
            mask = mask & puncts
        metric(arc_preds, rel_preds, arcs, rels, mask)

    def decode(self, arc_scores, rel_scores, mask, batch=None):
        """Pick a head and a relation for every token.

        Args:
            arc_scores: Arc scores; the last dimension ranges over candidate heads.
            rel_scores: Relation scores; the last dimension ranges over relations.
            mask: Mask forwarded to ``decode_dep`` when ``config.tree`` is truthy.
            batch: Unused here.

        Returns:
            A pair ``(arc_preds, rel_preds)``.
        """
        config = self.config
        tree, proj = config.tree, config.get('proj', False)
        # Either decode with ``decode_dep`` or take the best-scoring head greedily.
        arc_preds = decode_dep(arc_scores, mask, tree, proj) if tree else arc_scores.argmax(-1)
        # Best relation for every candidate head, then keep the one at the predicted head.
        best_rels = rel_scores.argmax(-1)
        rel_preds = best_rels.gather(-1, arc_preds.unsqueeze(-1)).squeeze(-1)
        return arc_preds, rel_preds

    def build_criterion(self, **kwargs):
        """Return the cross-entropy loss used for both arcs and relations."""
        return nn.CrossEntropyLoss()

    def build_metric(self, **kwargs):
        """Return a fresh ``AttachmentScore``; keyword arguments are ignored."""
        metric = AttachmentScore()
        return metric

    def on_config_ready(self, **kwargs):
        """Finish the setup that depends on the config: build the tokenizer and clamp ``patience``.

        ``patience`` is capped at ``epochs``. A ``patience`` of ``None`` is replaced by ``epochs``
        instead of raising ``TypeError`` from ``min``; the training loop reports the
        ``patience == epochs`` case without a patience budget.

        Args:
            **kwargs: Unused here.
        """
        self.build_transformer_tokenizer()  # We have to build tokenizer before building the dataloader and model
        config = self.config
        patience, epochs = config.patience, config.epochs
        config.patience = epochs if patience is None else min(patience, epochs)

    def prediction_to_head_rel(self, arcs: torch.LongTensor, rels: torch.LongTensor, batch: dict):
        """Yield, for every sentence, its list of ``(head, relation)`` pairs.

        Args:
            arcs: Predicted heads; the first column (ROOT) is dropped.
            rels: Predicted relation ids; the first column (ROOT) is dropped.
            batch: Batch dict whose ``token`` entry holds the tokens of every sentence, including
                the leading ROOT token.

        Yields:
            One list of ``(head index, relation label)`` tuples per sentence, truncated to the
            number of tokens after ROOT.
        """
        rel_labels = self.vocabs['rel'].idx_to_token
        heads = arcs[:, 1:].tolist()  # Skip the ROOT
        label_ids = rels[:, 1:].tolist()
        for sent_heads, sent_label_ids, tokens in zip(heads, label_ids, batch['token']):
            sent_len = len(tokens[1:])
            yield [(head, rel_labels[label_id])
                   for head, label_id in zip(sent_heads[:sent_len], sent_label_ids[:sent_len])]


================================================
FILE: hanlp/components/parsers/biaffine/biaffine_model.py
================================================
# -*- coding: utf-8 -*-
from typing import Any, Tuple

import torch
import torch.nn as nn
from torch.nn.utils.rnn import (pack_padded_sequence, pad_packed_sequence,
                                pad_sequence)

from hanlp.components.parsers.biaffine.biaffine import Biaffine
from hanlp.components.parsers.biaffine.mlp import MLP
from hanlp.components.parsers.biaffine.variationalbilstm import VariationalLSTM
from hanlp.layers.dropout import IndependentDropout, SharedDropout, WordDropout
from hanlp.layers.transformers.encoder import TransformerEncoder
from hanlp.layers.transformers.pt_imports import PreTrainedModel, PreTrainedTokenizer
from hanlp.layers.transformers.utils import transformer_encode


class EncoderWithContextualLayer(nn.Module):
    """Encoder producing the contextual token representations consumed by the biaffine decoder.

    Three layouts are built depending on ``config``:

    - transformer with ``transformer_lr`` set: the transformer output (optionally concatenated
      with POS embeddings) is used directly, no LSTM is created;
    - transformer without ``transformer_lr``: the transformer output (optionally concatenated
      with POS embeddings) is fed to a bidirectional ``VariationalLSTM``;
    - no transformer: word embeddings (plus optional pretrained and POS embeddings) are fed to
      a bidirectional ``VariationalLSTM``.

    ``self.hidden_size`` holds the size of the last dimension of the encoder output.
    """

    def __init__(self,
                 config,
                 pretrained_embed: torch.Tensor = None,
                 transformer: PreTrainedModel = None,
                 transformer_tokenizer: PreTrainedTokenizer = None,
                 ):
        """
        Args:
            config: Model configuration (sizes, dropout rates, feature switches).
            pretrained_embed: Optional pretrained word embedding matrix.
            transformer: Optional transformer; when given it replaces the word embeddings.
            transformer_tokenizer: Tokenizer handed to ``TransformerEncoder``.
        """
        super(EncoderWithContextualLayer, self).__init__()

        self.secondary_encoder = config.get('secondary_encoder', None)
        self.config = config

        if not transformer:
            self.pad_index = config.pad_index
            self.unk_index = config.unk_index
            if config.word_dropout:
                oov = self.unk_index
                excludes = [self.pad_index]
                self.word_dropout = WordDropout(p=config.word_dropout, oov_token=oov, exclude_tokens=excludes)
            else:
                self.word_dropout = None
        if transformer:
            input_size = 0
            if self.config.transformer_lr:
                # The transformer output is the encoder output.
                hidden_size = transformer.config.hidden_size
            else:
                # The transformer output is the input of the LSTM.
                input_size = transformer.config.hidden_size
                hidden_size = config.n_lstm_hidden * 2
            if config.feat == 'pos':
                self.feat_embed = nn.Embedding(num_embeddings=config.n_feats,
                                               embedding_dim=config.n_embed)
                self.embed_dropout = IndependentDropout(p=config.embed_dropout)
                if self.config.transformer_lr:
                    hidden_size += config.n_embed
                else:
                    input_size += config.n_embed
            if not self.config.transformer_lr:
                self.lstm = VariationalLSTM(input_size=input_size,
                                            hidden_size=config.n_lstm_hidden,
                                            num_layers=config.n_lstm_layers,
                                            dropout=config.hidden_dropout, bidirectional=True)
        else:
            # the embedding layer
            input_size = config.n_embed
            self.word_embed = nn.Embedding(num_embeddings=config.n_words,
                                           embedding_dim=config.n_embed)
            if pretrained_embed is not None:
                if not isinstance(pretrained_embed, torch.Tensor):
                    pretrained_embed = torch.Tensor(pretrained_embed)
                self.pretrained = nn.Embedding.from_pretrained(pretrained_embed)
                # The trainable embeddings start at zero and are added to the pretrained ones.
                nn.init.zeros_(self.word_embed.weight)
            if config.feat == 'pos':
                self.feat_embed = nn.Embedding(num_embeddings=config.n_feats,
                                               embedding_dim=config.n_embed)
                self.embed_dropout = IndependentDropout(p=config.embed_dropout)
                input_size += config.n_embed

            # the word-lstm layer
            hidden_size = config.n_lstm_hidden * 2
            self.lstm = VariationalLSTM(input_size=input_size,
                                        hidden_size=config.n_lstm_hidden,
                                        num_layers=config.n_lstm_layers,
                                        dropout=config.hidden_dropout, bidirectional=True)
        self.hidden_size = hidden_size
        self.hidden_dropout = SharedDropout(p=config.hidden_dropout)
        if transformer:
            transformer = TransformerEncoder(transformer, transformer_tokenizer, config.average_subwords,
                                             word_dropout=config.word_dropout,
                                             max_sequence_length=config.max_sequence_length)
        self.transformer = transformer

    def forward(self, words, feats, input_ids, token_span, mask, lens):
        """Encode a batch.

        Args:
            words: Word ids; used without a transformer, and to derive ``mask`` when it is ``None``.
            feats: Feature ids (e.g. POS) for ``feat_embed``, if such a layer exists.
            input_ids: Subword ids for the transformer.
            token_span: Token spans forwarded to the transformer encoder.
            mask: Boolean mask of real tokens; derived from ``words`` when ``None``.
            lens: Sentence lengths; derived from ``mask`` when ``None``.

        Returns:
            A pair ``(x, mask)`` of the encoded representations and the mask actually used.
        """
        if mask is None:
            # get the mask and lengths of given batch
            mask = words.ne(self.pad_index)
        if lens is None:
            lens = mask.sum(dim=1)
        _, seq_len = mask.shape
        if self.config.transformer:
            # trans_embed = self.run_transformer(input_ids, token_span=token_span)
            trans_embed = self.transformer.forward(input_ids, token_span=token_span)
            if hasattr(self, 'feat_embed'):
                feat_embed = self.feat_embed(feats)
                trans_embed, feat_embed = self.embed_dropout(trans_embed, feat_embed)
                embed = torch.cat((trans_embed, feat_embed), dim=-1)
            else:
                embed = trans_embed
            if hasattr(self, 'lstm'):
                x = self.run_rnn(embed, lens, seq_len)
            else:
                x = embed
            if self.secondary_encoder:
                x = self.secondary_encoder(x, mask)
            x = self.hidden_dropout(x)
        else:
            if self.word_dropout:
                words = self.word_dropout(words)
            # set the indices larger than num_embeddings to unk_index
            ext_mask = words.ge(self.word_embed.num_embeddings)
            ext_words = words.masked_fill(ext_mask, self.unk_index)

            # get outputs from embedding layers
            word_embed = self.word_embed(ext_words)
            if hasattr(self, 'pretrained'):
                word_embed += self.pretrained(words)
            if self.config.feat == 'char':
                feat_embed = self.feat_embed(feats[mask])
                feat_embed = pad_sequence(feat_embed.split(lens.tolist()), True)
            elif self.config.feat == 'bert':
                feat_embed = self.feat_embed(*feats)
            elif hasattr(self, 'feat_embed'):
                feat_embed = self.feat_embed(feats)
            else:
                feat_embed = None
            if feat_embed is not None:
                word_embed, feat_embed = self.embed_dropout(word_embed, feat_embed)
                # concatenate the word and feat representations
                embed = torch.cat((word_embed, feat_embed), dim=-1)
            else:
                embed = word_embed

            x = self.run_rnn(embed, lens, seq_len)
            x = self.hidden_dropout(x)
        return x, mask

    def run_rnn(self, embed, lens, seq_len):
        """Run the LSTM over ``embed`` and pad the output back to ``seq_len``.

        Args:
            embed: Batch-first input embeddings.
            lens: Sentence lengths, as a tensor or a list.
            seq_len: Length to pad the output to.

        Returns:
            The batch-first LSTM output.
        """
        # pack_padded_sequence requires tensor lengths to live on the CPU, whereas ``lens`` is
        # derived from the (possibly CUDA) mask in ``forward``.
        if torch.is_tensor(lens):
            lens = lens.cpu()
        x = pack_padded_sequence(embed, lens, True, False)
        x, _ = self.lstm(x)
        x, _ = pad_packed_sequence(x, True, total_length=seq_len)
        return x

    def run_transformer(self, input_ids, token_span):
        """Encode ``input_ids`` through ``transformer_encode`` with the configured subword averaging."""
        return transformer_encode(self.transformer, input_ids, None, None, token_span,
                                  average_subwords=self.config.average_subwords)


class BiaffineDecoder(nn.Module):
    """Biaffine scorer turning encoder states into arc scores and relation scores."""

    def __init__(self, hidden_size, n_mlp_arc, n_mlp_rel, mlp_dropout, n_rels, arc_dropout=None,
                 rel_dropout=None) -> None:
        """
        Args:
            hidden_size: Size of the encoder output fed to the MLPs.
            n_mlp_arc: Output size of the arc MLPs.
            n_mlp_rel: Output size of the relation MLPs.
            mlp_dropout: Default dropout rate of the MLPs.
            n_rels: Number of relation labels.
            arc_dropout: Dropout rate of the arc MLPs; ``None`` falls back to ``mlp_dropout``.
            rel_dropout: Dropout rate of the relation MLPs; ``None`` falls back to ``mlp_dropout``.
        """
        super().__init__()
        # An explicit rate of 0 must be honoured, so only ``None`` triggers the fallback
        # (``arc_dropout or mlp_dropout`` used to replace 0 with the default).
        arc_dropout = self._resolve_dropout(arc_dropout, mlp_dropout)
        rel_dropout = self._resolve_dropout(rel_dropout, mlp_dropout)
        # the MLP layers
        self.mlp_arc_h = MLP(hidden_size,
                             n_mlp_arc,
                             dropout=arc_dropout)
        self.mlp_arc_d = MLP(hidden_size,
                             n_mlp_arc,
                             dropout=arc_dropout)
        self.mlp_rel_h = MLP(hidden_size,
                             n_mlp_rel,
                             dropout=rel_dropout)
        self.mlp_rel_d = MLP(hidden_size,
                             n_mlp_rel,
                             dropout=rel_dropout)

        # the Biaffine layers
        self.arc_attn = Biaffine(n_in=n_mlp_arc,
                                 bias_x=True,
                                 bias_y=False)
        self.rel_attn = Biaffine(n_in=n_mlp_rel,
                                 n_out=n_rels,
                                 bias_x=True,
                                 bias_y=True)

    @staticmethod
    def _resolve_dropout(specific, default):
        """Return ``specific`` unless it is ``None``, in which case return ``default``."""
        return default if specific is None else specific

    def forward(self, x, mask=None, **kwargs: Any) -> Tuple[torch.Tensor, torch.Tensor]:
        """Score arcs and relations for the encoder output ``x``.

        Args:
            x: Encoder output.
            mask: Optional boolean mask of real tokens.
            **kwargs: Unused here.

        Returns:
            A pair ``(s_arc, s_rel)``.
        """
        arc_d, arc_h, rel_d, rel_h = self.apply_mlps(x)

        s_arc, s_rel = self.decode(arc_d, arc_h, rel_d, rel_h, mask, self.arc_attn, self.rel_attn)

        return s_arc, s_rel

    @staticmethod
    def decode(arc_d, arc_h, rel_d, rel_h, mask, arc_attn, rel_attn):
        """Apply the two attention layers and mask out padded head candidates.

        Args:
            arc_d: Dependent-side arc representations.
            arc_h: Head-side arc representations.
            rel_d: Dependent-side relation representations.
            rel_h: Head-side relation representations.
            mask: Optional boolean mask of real tokens.
            arc_attn: Callable scoring arcs from ``(arc_d, arc_h)``.
            rel_attn: Callable scoring relations from ``(rel_d, rel_h)``.

        Returns:
            A pair ``(s_arc, s_rel)``; ``s_arc`` is modified in place when ``mask`` is given.
        """
        # get arc and rel scores from the bilinear attention
        # [batch_size, seq_len, seq_len]
        s_arc = arc_attn(arc_d, arc_h)
        # [batch_size, seq_len, seq_len, n_rels]
        s_rel = rel_attn(rel_d, rel_h).permute(0, 2, 3, 1)
        if mask is not None:
            # set the scores that exceed the length of each sentence to -inf
            s_arc.masked_fill_(~mask.unsqueeze(1), float('-inf'))
        return s_arc, s_rel

    def apply_mlps(self, x):
        """Project ``x`` through the four MLPs; returns ``(arc_d, arc_h, rel_d, rel_h)``."""
        # apply MLPs to the hidden states
        arc_d = self.mlp_arc_d(x)
        arc_h = self.mlp_arc_h(x)
        rel_d = self.mlp_rel_d(x)
        rel_h = self.mlp_rel_h(x)
        return arc_d, arc_h, rel_d, rel_h


class BiaffineDependencyModel(nn.Module):
    """Dependency parsing model: an ``EncoderWithContextualLayer`` followed by a ``BiaffineDecoder``."""

    def __init__(self, config, pretrained_embed: torch.Tensor = None, transformer: PreTrainedModel = None,
                 transformer_tokenizer: PreTrainedTokenizer = None):
        """
        Args:
            config: Model configuration shared by the encoder and the decoder.
            pretrained_embed: Optional pretrained word embeddings for the encoder.
            transformer: Optional transformer for the encoder.
            transformer_tokenizer: Tokenizer paired with ``transformer``.
        """
        super().__init__()
        encoder = EncoderWithContextualLayer(config, pretrained_embed, transformer, transformer_tokenizer)
        self.encoder = encoder
        # The decoder input size is whatever the encoder ends up producing.
        self.biaffine_decoder = BiaffineDecoder(encoder.hidden_size,
                                                config.n_mlp_arc,
                                                config.n_mlp_rel,
                                                config.mlp_dropout,
                                                config.n_rels)

    def forward(self,
                words=None,
                feats=None,
                input_ids=None,
                token_span=None,
                mask=None, lens=None, **kwargs):
        """Encode the batch and score arcs and relations.

        All arguments are forwarded to the encoder; extra keyword arguments are ignored.

        Returns:
            A pair ``(s_arc, s_rel)`` as produced by the decoder.
        """
        hidden, mask = self.encoder(words, feats, input_ids, token_span, mask, lens)
        arc_scores, rel_scores = self.biaffine_decoder(hidden, mask)
        return arc_scores, rel_scores


================================================
FILE: hanlp/components/parsers/biaffine/biaffine_sdp.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-28 15:12
import functools
from collections import Counter
from typing import Union, List

import torch
from torch import nn

from hanlp_common.constant import UNK
from hanlp.common.transform import TransformList
from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from hanlp_common.conll import CoNLLUWord, CoNLLSentence
from hanlp.datasets.parsing.semeval15 import unpack_deps_to_head_deprel, append_bos_to_form_pos
from hanlp.metrics.parsing.labeled_f1 import LabeledF1
from hanlp_common.util import merge_locals_kwargs


class BiaffineSemanticDependencyParser(BiaffineDependencyParser):
    def __init__(self) -> None:
        r"""Implementation of "Stanford's graph-based neural dependency parser at
        the conll 2017 shared task" (:cite:`dozat2017stanford`) and "Establishing Strong Baselines for the New Decade"
        (:cite:`he-choi-2019`).

        Takes no arguments; all initialisation is delegated to the parent class.
        """
        super().__init__()

    def get_pad_dict(self):
        """Return a mapping with the single entry ``'arc' -> False``.

        NOTE(review): presumably the per-field padding value used when batching samples - confirm against the
        code that consumes this dict.
        """
        return {'arc': False}

    def build_metric(self, **kwargs):
        """Return a fresh :class:`LabeledF1` metric. Keyword arguments are accepted but unused."""
        return LabeledF1()

    # noinspection PyMethodOverriding
    def build_dataset(self, data, transform=None):
        """Build the dataset with the transforms needed for semantic dependency parsing prepended.

        Args:
            data: Forwarded as-is to the parent ``build_dataset``.
            transform: Optional extra transform appended after the two built-in ones.

        Returns:
            Whatever the parent ``build_dataset`` returns for the assembled transform list.
        """
        add_bos = functools.partial(append_bos_to_form_pos, pos_key='UPOS')
        unpack_deps = functools.partial(unpack_deps_to_head_deprel, pad_rel=self.config.pad_rel)
        pipeline = TransformList(add_bos, unpack_deps)
        if transform:
            pipeline.append(transform)
        return super().build_dataset(data, pipeline)

    def build_criterion(self, **kwargs):
        """Return the loss pair ``(BCEWithLogitsLoss, CrossEntropyLoss)``.

        ``compute_loss`` unpacks the pair in this order: the first scores arcs, the second scores relations.
        Keyword arguments are accepted but unused.
        """
        return nn.BCEWithLogitsLoss(), nn.CrossEntropyLoss()

    def feed_batch(self, batch):
        """Run the parent forward pass, then lift the mask and punctuation mask to the shape of the arc scores.

        Returns:
            ``(arc_scores, rel_scores, mask, puncts)`` where ``mask`` comes from :meth:`convert_to_3d_mask` and
            ``puncts`` from :meth:`convert_to_3d_puncts`.
        """
        arc_scores, rel_scores, token_mask, token_puncts = super().feed_batch(batch)
        pair_mask = self.convert_to_3d_mask(arc_scores, token_mask)
        pair_puncts = self.convert_to_3d_puncts(token_puncts, pair_mask)
        return arc_scores, rel_scores, pair_mask, pair_puncts

    @staticmethod
    def convert_to_3d_puncts(puncts, mask):
        """Broadcast a punctuation mask to the shape of ``mask`` by adding a trailing axis.

        Returns ``None`` unchanged when no punctuation mask is supplied.
        """
        if puncts is None:
            return None
        return puncts.unsqueeze(-1).expand_as(mask)

    @staticmethod
    def convert_to_3d_mask(arc_scores, mask):
        """Expand a per-token mask to the shape of ``arc_scores``.

        The mask is broadcast along a new trailing axis. Every column except column 0 is then AND-ed with the
        transposed mask, so an entry survives only if both of its indices are unmasked. Column 0 keeps the plain
        per-token mask because it predicts root.

        Returns:
            A new boolean tensor; the input ``mask`` is not modified.
        """
        mask_3d = mask.unsqueeze(-1).expand_as(arc_scores).clone()
        # Computed into a temporary first, so the in-place write below never reads half-updated values.
        both_valid = mask_3d[:, :, 1:] & mask_3d.transpose(1, 2)[:, :, 1:]
        mask_3d[:, :, 1:] = both_valid
        return mask_3d

    def compute_loss(self, arc_scores, rel_scores, arcs, rels, mask: torch.BoolTensor, criterion, batch=None):
        """Combine the arc loss (binary cross entropy) with the relation loss (cross entropy).

        Args:
            arc_scores: Arc logits, indexed by ``mask``.
            rel_scores: Relation logits, indexed by ``mask`` and then by the gold arcs.
            arcs: Gold arc indicators; used both as BCE targets and as a boolean index.
            rels: Gold relation ids.
            mask: Boolean mask selecting the entries that take part in the loss.
            criterion: The ``(bce, ce)`` pair.
            batch: Unused here.

        Returns:
            ``arc_loss + rel_loss`` when ``arc_loss_interpolation`` is unset or falsy in the config, otherwise
            ``2 * (w * arc_loss + (1 - w) * rel_loss)``. The relation term is skipped when no gold arc is
            selected.
        """
        bce, ce = criterion
        masked_arc_scores, gold_arcs = arc_scores[mask], arcs[mask]
        masked_rel_scores, masked_rels = rel_scores[mask], rels[mask]
        # Relations are only scored where a gold arc exists.
        gold_rel_scores, gold_rels = masked_rel_scores[gold_arcs], masked_rels[gold_arcs]

        arc_loss = bce(masked_arc_scores, gold_arcs.to(torch.float))
        weight = self.config.get('arc_loss_interpolation', None)
        has_rels = len(gold_rels)

        if not weight:
            if not has_rels:
                return arc_loss
            return arc_loss + ce(gold_rel_scores, gold_rels)

        total = arc_loss * weight
        if has_rels:
            total = total + ce(gold_rel_scores, gold_rels) * (1 - weight)
        return total * 2

    def cache_dataset(self, dataset, timer, training=False, logger=None):
        """Iterate over the dataset once and, on the training set, derive decoding constraints from it.

        When ``config.apply_constraint`` is off, the work is delegated to the parent implementation. Otherwise
        every sample is visited and ``timer.log`` is called once per sample. On the training set the following
        are recorded in the config:

        - ``single_root``: whether every sentence has exactly one root arc (only if currently ``None``);
        - ``no_zero_head``: whether no ``DEPS`` entry equals ``'_'`` (only if currently ``None``);
        - ``root_rel_id``: vocabulary index of the most frequent relation on root arcs.

        Args:
            dataset: Iterable of samples exposing ``'arc'``, ``'rel'`` and ``'DEPS'``.
            timer: Object with a ``log`` method, called once per sample.
            training: Whether ``dataset`` is the training set.
            logger: Optional logger used to report the derived properties.
        """
        if not self.config.apply_constraint:
            return super().cache_dataset(dataset, timer, training)

        roots_per_sentence = Counter()
        root_rels = Counter()
        no_zero_head = True
        for sample in dataset:
            if training:
                arcs = sample['arc']
                # Column 0 of each row marks an arc from the root.
                roots_per_sentence[sum(row[0] for row in arcs)] += 1
                no_zero_head = no_zero_head and all(dep != '_' for dep in sample['DEPS'])
                for i in range(len(arcs)):
                    if arcs[i][0]:
                        root_rels[sample['rel'][i][0]] += 1
            timer.log('Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]')

        if not training:
            return

        if self.config.single_root is None:
            self.config.single_root = (len(roots_per_sentence) == 1
                                       and roots_per_sentence.most_common()[0][0] == 1)
        if self.config.no_zero_head is None:
            self.config.no_zero_head = no_zero_head
        root_rel = root_rels.most_common()[0][0]
        self.config.root_rel_id = self.vocabs['rel'].get_idx(root_rel)
        if logger:
            logger.info(f'Training set properties: [blue]single_root = {self.config.single_root}[/blue], '
                        f'[blue]no_zero_head = {no_zero_head}[/blue], '
                        f'[blue]root_rel = {root_rel}[/blue]')

    def decode(self, arc_scores, rel_scores, mask, batch=None):
        """Turn arc and relation scores into a boolean arc tensor and a relation-id tensor.

        ``arc_scores`` and ``rel_scores`` are modified in place. Without ``config.apply_constraint`` an arc is
        predicted wherever its score is positive. With it, the optional ``single_root`` and ``no_zero_head``
        constraints recorded in the config are enforced as described inline.

        Args:
            arc_scores: Arc logits; the code indexes it with three axes.
            rel_scores: Relation logits with one extra trailing label axis.
            mask: Boolean tensor used to index ``arc_scores`` directly.
            batch: Unused here.

        Returns:
            ``(arc, rel)``: a boolean tensor shaped like ``arc_scores`` and the arg-max relation id per entry.
        """
        # Index tensor whose entry [b, 0, j] equals j, so scattering along dim 1 hits the diagonal [b, j, j].
        eye = torch.arange(0, arc_scores.size(1), device=arc_scores.device).view(1, 1, -1).expand(
            arc_scores.size(0), -1, -1)
        inf = float('inf')
        # Forbid self-loops.
        arc_scores.scatter_(dim=1, index=eye, value=-inf)

        if self.config.apply_constraint:
            if self.config.get('single_root', False):
                arc_scores[~mask] = -inf  # the biaffine decoder doesn't apply 3d mask for now
                # Pick the highest scoring entry of column 0 per sentence and make it the only root arc.
                root_mask = arc_scores[:, :, 0].argmax(dim=-1).unsqueeze_(-1).expand_as(arc_scores[:, :, 0])
                arc_scores[:, :, 0] = -inf
                arc_scores[:, :, 0].scatter_(dim=-1, index=root_mask, value=inf)

            # Column 0 must carry the root relation and no other column may use it.
            root_rel_id = self.config.root_rel_id
            rel_scores[:, :, 0, root_rel_id] = inf
            rel_scores[:, :, 1:, root_rel_id] = -inf

            # NOTE: arc_scores_T is a view, so writes through it also change arc_scores.
            arc_scores_T = arc_scores.transpose(-1, -2)
            # Keep an arc only if it is positive and beats the arc in the opposite direction.
            arc = ((arc_scores > 0) & (arc_scores_T < arc_scores))
            if self.config.get('no_zero_head', False):
                arc_scores_T[arc] = -inf  # avoid cycle between a pair of nodes
                # For every row, switch on its best remaining entry so that no row ends up without an arc.
                arc_scores_fix = arc_scores_T.argmax(dim=-2).unsqueeze_(-1).expand_as(arc_scores)
                arc.scatter_(dim=-1, index=arc_scores_fix, value=True)
        else:
            arc = arc_scores > 0
        rel = rel_scores.argmax(dim=-1)
        return arc, rel

    def collect_outputs_extend(self, predictions, arc_preds, rel_preds, lens, mask):
        """Append one ``(arcs, rels, mask)`` triple of nested lists per sentence to ``predictions``.

        The tensors are converted with ``tolist()`` and kept at their padded size; ``lens`` is not used.
        """
        predictions.extend(zip(arc_preds.tolist(), rel_preds.tolist(), mask.tolist()))
        # all_arcs.extend(seq.tolist() for seq in arc_preds[mask].split([x * x for x in lens]))
        # all_rels.extend(seq.tolist() for seq in rel_preds[mask].split([x * x for x in lens]))

    def predictions_to_human(self, predictions, outputs, data, use_pos, conll=True):
        """Convert raw predictions into :class:`CoNLLSentence` objects appended to ``outputs``.

        Args:
            predictions: Iterable of ``(arcs, rels, masks)`` triples, one per sentence; row 0 of ``arcs`` and
                ``rels`` is skipped.
            outputs: List receiving one sentence per input.
            data: The input sentences; each item is a token, or a ``(token, pos)`` pair when ``use_pos`` is set.
            use_pos: Whether the items of a sentence carry a part-of-speech tag.
            conll: Unused here.
        """
        rel_vocab = self.vocabs['rel']
        for tokens, (arcs, rels, _masks) in zip(data, predictions):
            # Head indices run over the tokens plus the leading position 0.
            n_heads = len(tokens) + 1
            sent = CoNLLSentence()
            for idx, (cell, arc_row, rel_row) in enumerate(zip(tokens, arcs[1:], rels[1:])):
                token, pos = cell if use_pos else (cell, None)
                deps = [(head, rel_vocab[rel_row[head]]) for head in range(n_heads) if arc_row[head]]
                sent.append(CoNLLUWord(idx + 1, token, upos=pos, head=None, deprel=None, deps=deps))
            outputs.append(sent)

    def fit(self, trn_data, dev_data, save_dir,
            feat=None,
            n_embed=100,
            pretrained_embed=None,
            transformer=None,
            average_subwords=False,
            word_dropout: float = 0.2,
            transformer_hidden_dropout=None,
            layer_dropout=0,
            mix_embedding: int = None,
            embed_dropout=.33,
            n_lstm_hidden=400,
            n_lstm_layers=3,
            hidden_dropout=.33,
            n_mlp_arc=500,
            n_mlp_rel=100,
            mlp_dropout=.33,
            arc_dropout=None,
            rel_dropout=None,
            arc_loss_interpolation=0.4,
            lr=2e-3,
            transformer_lr=5e-5,
            mu=.9,
            nu=.9,
            epsilon=1e-12,
            clip=5.0,
            decay=.75,
            decay_steps=5000,
            weight_decay=0,
            warmup_steps=0.1,
            separate_optimizer=True,
            patience=100,
            batch_size=None,
            sampler_builder=None,
            lowercase=False,
            epochs=50000,
            apply_constraint=False,
            single_root=None,
            no_zero_head=None,
            punct=False,
            min_freq=2,
            logger=None,
            verbose=True,
            unk=UNK,
            pad_rel=None,
            max_sequence_length=512,
            gradient_accumulation=1,
            devices: Union[float, int, List[int]] = None,
            transform=None,
            **kwargs):
        """Train the parser by forwarding every argument to the parent ``fit``.

        All parameters are gathered through ``locals()`` merged with ``kwargs``, so no additional local variable
        may be introduced before the call below. Within this class the following settings are read back from
        the config:

        Args:
            arc_loss_interpolation: Weight of the arc loss in ``compute_loss``; the relation loss gets
                ``1 - arc_loss_interpolation``.
            apply_constraint: Enables the constrained branches of ``cache_dataset`` and ``decode``.
            single_root: If ``None``, inferred from the training set in ``cache_dataset``.
            no_zero_head: If ``None``, inferred from the training set in ``cache_dataset``.
            pad_rel: Passed to ``unpack_deps_to_head_deprel`` in ``build_dataset``.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        return super().fit(**merge_locals_kwargs(locals(), kwargs))


================================================
FILE: hanlp/components/parsers/biaffine/mlp.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import torch.nn as nn

from hanlp.layers.dropout import SharedDropout


class MLP(nn.Module):
    r"""
    A single linear layer followed by an optional non-linearity and a dropout layer:
    :math:`y = \mathrm{Dropout}(\mathrm{Activation}(x A^T + b))`

    Args:
        n_in (int):
            The size of each input feature.
        n_out (int):
            The size of each output feature.
        dropout (float):
            Ratio handed to the :class:`SharedDropout` layer applied to the output. Default: 0.
        activation (bool):
            If ``True``, a LeakyReLU with negative slope 0.1 is used, otherwise the identity. Default: True.
    """

    def __init__(self, n_in, n_out, dropout=0, activation=True):
        super().__init__()

        self.n_in, self.n_out = n_in, n_out
        self.linear = nn.Linear(n_in, n_out)
        if activation:
            self.activation = nn.LeakyReLU(negative_slope=0.1)
        else:
            self.activation = nn.Identity()
        self.dropout = SharedDropout(p=dropout)

        self.reset_parameters()

    def __repr__(self):
        fields = [f"n_in={self.n_in}", f"n_out={self.n_out}"]
        # The dropout ratio is only shown when it is actually active.
        if self.dropout.p > 0:
            fields.append(f"dropout={self.dropout.p}")
        return f"{self.__class__.__name__}({', '.join(fields)})"

    def reset_parameters(self):
        """Initialise the weight orthogonally and the bias with zeros."""
        nn.init.orthogonal_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)

    def forward(self, x):
        r"""
        Args:
            x (~torch.Tensor):
                The size of each input feature is `n_in`.

        Returns:
            A tensor with the size of each output feature `n_out`.
        """
        return self.dropout(self.activation(self.linear(x)))


================================================
FILE: hanlp/components/parsers/biaffine/structual_attention.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-26 10:40
from typing import Union, List

import torch
import torch.nn.functional as F
from hanlp.utils.torch_util import lengths_to_mask
from torch import nn

from hanlp.common.torch_component import TorchComponent
from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDecoder
from hanlp.layers.transformers.encoder import TransformerEncoder
from hanlp.layers.transformers.pt_imports import PreTrainedModel, PreTrainedTokenizer
from hanlp.metrics.accuracy import CategoricalAccuracy
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp_common.util import merge_locals_kwargs


class StructuralAttentionLayer(nn.Module):
    """Biaffine scorer whose arc and relation distributions re-weight the hidden states.

    Args:
        hidden_size: Width of the incoming hidden states.
        n_mlp_arc: Forwarded to :class:`BiaffineDecoder`.
        n_mlp_rel: Forwarded to :class:`BiaffineDecoder`.
        mlp_dropout: Forwarded to :class:`BiaffineDecoder`.
        n_rels: Number of relations; also the number of value matrices in ``head_WV``.
        projeciton: Optional width to project the hidden states to before attention (the parameter name is
            kept as spelled because callers may pass it by keyword).
    """

    def __init__(self, hidden_size, n_mlp_arc, n_mlp_rel, mlp_dropout, n_rels, projeciton=None) -> None:
        super().__init__()
        self.biaffine = BiaffineDecoder(hidden_size, n_mlp_arc, n_mlp_rel, mlp_dropout, n_rels)
        self.projection = nn.Linear(hidden_size, projeciton) if projeciton else None
        # All remaining layers operate on the projected width when a projection is configured.
        value_size = projeciton if projeciton else hidden_size
        self.head_WV = nn.Parameter(torch.randn(n_rels, value_size, value_size))
        self.dense = nn.Linear(value_size * n_rels, value_size)
        self.activation = nn.GELU()

    def forward(self, x, mask):
        """Score arcs and relations, then build new hidden states weighted by the resulting probabilities.

        Returns:
            ``(s_arc, s_rel, x)``: the raw biaffine scores and the transformed hidden states.
        """
        s_arc, s_rel = self.biaffine(x, mask)
        arc_probs = F.softmax(s_arc, dim=-1) * mask.unsqueeze(-1)
        rel_probs = F.softmax(s_rel, -1)
        joint_probs = arc_probs.unsqueeze(-1) * rel_probs
        if self.projection is not None:
            x = self.projection(x)
        # NOTE(review): 'bijk,bih->bihk' sums the probabilities over j and multiplies by x at position i,
        # so states of other positions j are never mixed in - confirm this is intended.
        weighted = torch.einsum('bijk,bih->bihk', joint_probs, x)
        # NOTE(review): 'bihk,khm->bihk' sums head_WV over m rather than contracting over h - confirm intended.
        transformed = torch.einsum('bihk,khm->bihk', weighted, self.head_WV)
        x = self.activation(self.dense(transformed.flatten(2)))
        return s_arc, s_rel, x


class StructuralAttentionModel(nn.Module):
    """Transformer encoder followed by a structural attention layer and a linear head over the vocabulary.

    Args:
        config: Hyper-parameter container; see the attributes read in ``__init__``.
        transformer: Transformer whose ``config.hidden_size`` defines the hidden width.
        transformer_tokenizer: Tokenizer whose ``vocab_size`` defines the output width of the ``mlm`` head.
    """

    def __init__(self,
                 config,
                 transformer: PreTrainedModel = None,
                 transformer_tokenizer: PreTrainedTokenizer = None
                 ) -> None:
        super().__init__()
        self.encoder = TransformerEncoder(
            transformer,
            transformer_tokenizer,
            config.average_subwords,
            config.scalar_mix,
            None,  # No word_dropout since SA is predicting masked tokens
            config.transformer_hidden_dropout,
            config.layer_dropout,
            config.max_sequence_length,
        )
        encoder_width = transformer.config.hidden_size
        self.sa = StructuralAttentionLayer(
            encoder_width,
            config.n_mlp_arc,
            config.n_mlp_rel,
            config.mlp_dropout,
            config.n_rels,
            config.projection,
        )
        # The attention layer emits the projected width whenever a projection is configured.
        mlm_width = config.projection if config.projection else encoder_width
        self.mlm = nn.Linear(mlm_width, transformer_tokenizer.vocab_size)

    def forward(self,
                input_ids: torch.LongTensor,
                attention_mask=None,
                token_type_ids=None,
                token_span=None,
                mask=None,
                batch=None,
                **kwargs):
        """Encode the input, apply structural attention and predict vocabulary logits.

        ``batch`` and any extra keyword arguments are accepted and ignored.

        Returns:
            ``(s_arc, s_rel, x)`` where ``x`` is the output of the ``mlm`` head.
        """
        encoded = self.encoder(input_ids, attention_mask, token_type_ids, token_span)
        s_arc, s_rel, attended = self.sa(encoded, mask)
        return s_arc, s_rel, self.mlm(attended)


class MaskedTokenGenerator(object):
    """Randomly replace token ids with the tokenizer's mask token id.

    Args:
        transformer_tokenizer: Tokenizer providing ``mask_token_id``, ``pad_token_id``, ``cls_token_id`` and
            ``sep_token_id``.
        mask_prob: Probability with which each eligible token is replaced.
    """

    def __init__(self, transformer_tokenizer: 'PreTrainedTokenizer', mask_prob=0.15) -> None:
        super().__init__()
        self.mask_prob = mask_prob
        self.transformer_tokenizer = transformer_tokenizer
        self.oov = transformer_tokenizer.mask_token_id
        self.pad = transformer_tokenizer.pad_token_id
        self.cls = transformer_tokenizer.cls_token_id
        self.sep = transformer_tokenizer.sep_token_id
        # Ids that must never be replaced.
        self.excludes = [self.pad, self.cls, self.sep]

    def __call__(self, tokens: torch.LongTensor, prefix_mask: torch.Tensor):
        """Mask eligible tokens.

        Args:
            tokens: Token ids.
            prefix_mask: Mask of the same shape, non-zero (or ``True``) where a token may be replaced. Both
                boolean and integer masks are accepted; the tensor is never modified.

        Returns:
            ``(result, oov_mask)``: the ids with replacements applied and the boolean mask of replaced slots.
        """
        # Only mask prefixes since the others won't be attended. The mask is coerced to bool because an
        # in-place ``&=`` between a bool tensor and an integer mask raises a dtype error.
        eligible = prefix_mask.to(torch.bool)
        for special in self.excludes:
            if special is None:  # the tokenizer does not define this special token
                continue
            # Out-of-place on purpose: ``eligible`` may alias the caller's ``prefix_mask``.
            eligible = eligible & (tokens != special)
        # Create a uniformly random mask selecting either the original words or OOV tokens
        dropout_mask = (tokens.new_empty(tokens.size(), dtype=torch.float).uniform_() < self.mask_prob)
        oov_mask = dropout_mask & eligible

        oov_fill = tokens.new_empty(tokens.size(), dtype=torch.long).fill_(self.oov)

        result = torch.where(oov_mask, oov_fill, tokens)
        return result, oov_mask


class StructuralAttentionParser(BiaffineDependencyParser):
    def __init__(self) -> None:
        """Create an untrained parser; ``model`` and ``mlm_generator`` start as ``None`` and are set later."""
        super().__init__()
        self.model: StructuralAttentionModel = None
        # Assigned in ``on_config_ready``.
        self.mlm_generator: MaskedTokenGenerator = None

    def build_model(self, training=True, **kwargs) -> torch.nn.Module:
        """Build the transformer from the config and wrap it in a :class:`StructuralAttentionModel`.

        Extra keyword arguments are accepted and ignored.
        """
        backbone = TransformerEncoder.build_transformer(config=self.config, training=training)
        return StructuralAttentionModel(self.config, backbone, self.transformer_tokenizer)

    def fit(self, trn_data, dev_data, save_dir,
            transformer=None,
            mask_prob=0.15,
            projection=None,
            average_subwords=False,
            transformer_hidden_dropout=None,
            layer_dropout=0,
            mix_embedding: int = None,
            embed_dropout=.33,
            n_mlp_arc=500,
            n_mlp_rel=100,
            mlp_dropout=.33,
            lr=2e-3,
            transformer_lr=5e-5,
            mu=.9,
            nu=.9,
            epsilon=1e-12,
            clip=5.0,
            decay=.75,
            decay_steps=5000,
            patience=100,
            sampler='kmeans',
            n_buckets=32,
            batch_max_tokens=5000,
            batch_size=None,
            epochs=50000,
            tree=False,
            punct=False,
            logger=None,
            verbose=True,
            max_sequence_length=512,
            devices: Union[float, int, List[int]] = None,
            transform=None,
            **kwargs):
        """Train the parser by forwarding every argument to ``TorchComponent.fit``.

        The direct parent's ``fit`` is bypassed on purpose: ``TorchComponent.fit`` is called explicitly. All
        parameters are gathered through ``locals()`` merged with ``kwargs``, so no additional local variable may
        be introduced before the call below. Within this class ``mask_prob`` is read back in
        ``on_config_ready`` and ``max_sequence_length`` in ``build_tokenizer_transform``.

        Returns:
            Whatever ``TorchComponent.fit`` returns.
        """
        return TorchComponent.fit(self, **merge_locals_kwargs(locals(), kwargs))

    def feed_batch(self, batch):
        """Run the model on a batch, masking input ids first when training.

        During training the original ids are saved under ``'gold_input_ids'``, while ``'input_ids'`` and
        ``'input_ids_mask'`` are replaced by the output of the mask generator. The model's vocabulary logits
        are always stored under ``'pred_input_ids'``.

        Returns:
            ``(arc_scores, rel_scores, mask, puncts)`` with position 0 of every sentence masked out.
        """
        training = self.model.training
        if training:
            original_ids = batch['input_ids']
            prefix_mask = batch['prefix_mask']
            batch['gold_input_ids'] = original_ids
            batch['input_ids'], batch['input_ids_mask'] = self.mlm_generator(original_ids, prefix_mask)

        words = batch.get('token_id', None)
        feats = batch.get('pos_id', None)
        lens = batch['sent_length']
        puncts = batch.get('punct_mask', None)
        mask = lengths_to_mask(lens)

        arc_scores, rel_scores, pred_input_ids = self.model(words=words, feats=feats, mask=mask, batch=batch, **batch)
        batch['pred_input_ids'] = pred_input_ids

        # ignore the first token of each sentence
        # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
        if training:
            mask = mask.clone()
        mask[:, 0] = 0
        return arc_scores, rel_scores, mask, puncts

    def on_config_ready(self, **kwargs):
        """Run the parent hook, then create the mask generator from the tokenizer and ``config.mask_prob``."""
        super().on_config_ready(**kwargs)
        self.mlm_generator = MaskedTokenGenerator(self.transformer_tokenizer, self.config.mask_prob)

    def compute_loss(self, arc_scores, rel_scores, arcs, rels, mask, criterion, batch=None):
        """Return the parsing loss, plus the masked-token loss while training.

        While training, ``batch['gold_input_ids']`` and ``batch['input_ids_mask']`` are overwritten with their
        values gathered at the first index of every ``token_span`` entry, so that they line up with
        ``batch['pred_input_ids']``. ``update_metric`` relies on these updated entries.

        Args:
            arc_scores, rel_scores, arcs, rels, mask, criterion: Forwarded to the parent loss.
            batch: Batch dict; must hold ``gold_input_ids``, ``pred_input_ids``, ``input_ids_mask`` and
                ``token_span`` while training.

        Returns:
            The parsing loss when not training or when no token was masked, otherwise the parsing loss plus the
            cross entropy over the masked positions.
        """
        parse_loss = BiaffineDependencyParser.compute_loss(self, arc_scores, rel_scores, arcs, rels, mask, criterion, batch)
        if not self.model.training:
            return parse_loss

        gold_input_ids = batch['gold_input_ids']
        pred_input_ids = batch['pred_input_ids']
        input_ids_mask = batch['input_ids_mask']
        first_subword = batch['token_span'][:, :, 0]
        gold_input_ids = batch['gold_input_ids'] = gold_input_ids.gather(1, first_subword)
        input_ids_mask = batch['input_ids_mask'] = input_ids_mask.gather(1, first_subword)

        masked_pred = pred_input_ids[input_ids_mask]
        # Random masking can leave a batch without any masked token. Cross entropy with mean reduction over an
        # empty selection is NaN and would poison the whole loss, so skip the term in that case.
        if not len(masked_pred):
            return parse_loss
        mlm_loss = F.cross_entropy(masked_pred, gold_input_ids[input_ids_mask])
        return parse_loss + mlm_loss

    def build_tokenizer_transform(self):
        """Create the subword tokenizer transform for the ``'token'`` field.

        The prefix mask and token spans are requested from it; ``feed_batch`` and ``compute_loss`` read the
        ``'prefix_mask'`` and ``'token_span'`` batch entries. The maximum length comes from
        ``config.max_sequence_length`` and defaults to 512.
        """
        max_len = self.config.get('max_sequence_length', 512)
        return TransformerSequenceTokenizer(
            self.transformer_tokenizer,
            'token',
            '',
            ret_prefix_mask=True,
            ret_token_span=True,
            cls_is_bos=True,
            max_seq_length=max_len,
            truncate_long_sequences=False,
        )

    def build_metric(self, training=None, **kwargs):
        """Build the parsing metric, paired with a masked-token accuracy when ``training`` is truthy.

        Returns:
            ``(parse_metric, CategoricalAccuracy())`` for training, otherwise just the parent's metric.
        """
        parse_metric = super().build_metric(**kwargs)
        if not training:
            return parse_metric
        return parse_metric, CategoricalAccuracy()

    def update_metric(self, arc_scores, rel_scores, arcs, rels, mask, puncts, metric, batch=None):
        """Update the parsing metric and, when ``metric`` is a pair, the masked-token accuracy too.

        The masked-token accuracy is computed from ``batch['pred_input_ids']`` and ``batch['gold_input_ids']``
        at the positions selected by ``batch['input_ids_mask']``; it is skipped when nothing is selected.
        """
        if not isinstance(metric, tuple):
            super().update_metric(arc_scores, rel_scores, arcs, rels, mask, puncts, metric)
            return

        parse_metric, mlm_metric = metric
        super().update_metric(arc_scores, rel_scores, arcs, rels, mask, puncts, parse_metric)
        gold_input_ids = batch['gold_input_ids']
        selected = batch['input_ids_mask']
        predicted = batch['pred_input_ids'][selected]
        gold = gold_input_ids[selected]
        if len(predicted):
            mlm_metric(predicted, gold)

    def _report(self, loss, metric):
        """Format the progress line; a metric pair appends the masked-token metric to the parent's report."""
        if not isinstance(metric, tuple):
            return super()._report(loss, metric)
        parse_metric, mlm_metric = metric
        return super()._report(loss, parse_metric) + f' {mlm_metric}'


================================================
FILE: hanlp/components/parsers/biaffine/variationalbilstm.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import torch
import torch.nn as nn
from torch.nn.modules.rnn import apply_permutation
from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence

from hanlp.common.structure import ConfigTracker
from hanlp.layers.dropout import SharedDropout


class VariationalLSTM(nn.Module):
    r"""
    VariationalLSTM is a variant of the vanilla bidirectional LSTM adopted by Biaffine Parser
    with the only difference of the dropout strategy.
    It drops nodes in the LSTM layers (input and recurrent connections)
    and applies the same dropout mask at every recurrent timestep.

    APIs are roughly the same as :class:`~torch.nn.LSTM` except that only
    :class:`~torch.nn.utils.rnn.PackedSequence` is accepted as input.

    References:
        - Timothy Dozat and Christopher D. Manning. 2017.
          `Deep Biaffine Attention for Neural Dependency Parsing`_.

    Args:
        input_size (int):
            The number of expected features in the input.
        hidden_size (int):
            The number of features in the hidden state `h`.
        num_layers (int):
            The number of recurrent layers. Default: 1.
        bidirectional (bool):
            If ``True``, becomes a bidirectional LSTM. Default: ``False``
        dropout (float):
            If non-zero, introduces a :class:`SharedDropout` layer on the outputs of each LSTM layer except the last layer.
            Default: 0.

    .. _Deep Biaffine Attention for Neural Dependency Parsing:
        https://openreview.net/forum?id=Hk95PK9le
    """

    def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=False, dropout=0):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout = dropout
        self.num_directions = 1 + self.bidirectional

        self.f_cells = nn.ModuleList()
        if bidirectional:
            self.b_cells = nn.ModuleList()
        for _ in range(self.num_layers):
            self.f_cells.append(nn.LSTMCell(input_size=input_size, hidden_size=hidden_size))
            if bidirectional:
                self.b_cells.append(nn.LSTMCell(input_size=input_size, hidden_size=hidden_size))
            input_size = hidden_size * self.num_directions

        self.reset_parameters()

    def __repr__(self):
        s = f"{self.input_size}, {self.hidden_size}"
        if self.num_layers > 1:
            s += f", num_layers={self.num_layers}"
        if self.bidirectional:
            s += f", bidirectional={self.bidirectional}"
        if self.dropout > 0:
            s += f", dropout={self.dropout}"

        return f"{self.__class__.__name__}({s})"

    def reset_parameters(self):
        for param in self.parameters():
            # apply orthogonal_ to weight
            if len(param.shape) > 1:
                nn.init.orthogonal_(param)
            # apply zeros_ to bias
            else:
                nn.init.zeros_(param)

    def permute_hidden(self, hx, permutation):
        if permutation is None:
            return hx
        h = apply_permutation(hx[0], permutation)
        c = apply_permutation(hx[1], permutation)

        return h, c

    def layer_forward(self, x, hx, cell, batch_sizes, reverse=False):
        """Run one direction of one layer over a packed sequence, step by step.

        Args:
            x: Sequence of per-timestep input chunks; chunk ``t`` is fed to ``cell`` together with a state of
                ``batch_sizes[t]`` rows.
            hx: Initial ``(h, c)`` pair for the whole batch.
            cell: The ``LSTMCell`` to apply.
            batch_sizes: Number of active sequences at each timestep.
            reverse: Process the timesteps from last to first.

        Returns:
            ``(output, hx_n)``: the hidden states of all steps concatenated in timestep order, and the final
            ``[h, c]`` states.
        """
        # hx_0 keeps the initial states; hx_i is the running state of the currently active sequences.
        hx_0 = hx_i = hx
        hx_n, output = [], []
        steps = reversed(range(len(x))) if reverse else range(len(x))
        if self.training:
            # One mask for the whole layer, reused at every timestep below.
            hid_mask = SharedDropout.get_mask(hx_0[0], self.dropout)

        for t in steps:
            last_batch_size, batch_size = len(hx_i[0]), batch_sizes[t]
            if last_batch_size < batch_size:
                # More sequences become active (reverse pass): start them from their initial states.
                hx_i = [torch.cat((h, ih[last_batch_size:batch_size])) for h, ih in zip(hx_i, hx_0)]
            else:
                # Some sequences finished: set their states aside and keep only the active rows.
                hx_n.append([h[batch_size:] for h in hx_i])
                hx_i = [h[:batch_size] for h in hx_i]
            hx_i = [h for h in cell(x[t], hx_i)]
            # The emitted output is the hidden state before the recurrent dropout mask is applied.
            output.append(hx_i[0])
            if self.training:
                hx_i[0] = hx_i[0] * hid_mask[:batch_size]
        if reverse:
            # The reverse pass ends at t=0, so the running state is the final state.
            hx_n = hx_i
            output.reverse()
        else:
            # States were set aside from the end of the batch first; reverse before concatenating.
            hx_n.append(hx_i)
            hx_n = [torch.cat(h) for h in zip(*reversed(hx_n))]
        output = torch.cat(output)

        return output, hx_n

    def forward(self, sequence, hx=None):
        r"""
        Args:
            sequence (~torch.nn.utils.rnn.PackedSequence):
                A packed variable length sequence.
            hx (~torch.Tensor, ~torch.Tensor):
                A tuple composed of two tensors `h` and `c`.
                `h` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the initial hidden state
                for each element in the batch.
                `c` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the initial cell state
                for each element in the batch.
                If `hx` is not provided, both `h` and `c` default to zero.
                Default: ``None``.

        Returns:
            ~torch.nn.utils.rnn.PackedSequence, (~torch.Tensor, ~torch.Tensor):
                The first is a packed variable length sequence.
                The second is a tuple of tensors `h` and `c`.
                `h` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the hidden state for `t=seq_len`.
                Like output, the layers can be separated using ``h.view(num_layers, num_directions, batch_size, hidden_size)``
                and similarly for c.
                `c` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the cell state for `t=seq_len`.
        """
        x, batch_sizes = sequence.data, sequence.batch_sizes.tolist()
        batch_size = batch_sizes[0]
        h_n, c_n = [], []

        if hx is None:
            ih = x.new_zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size)
            h, c = ih, ih
        else:
            h, c = self.permute_hidden(hx, sequence.sorted_indices)
        h = h.view(self.num_layers, self.num_directions, batch_size, self.hidden_size)
        c = c.view(self.num_layers, self.num_directions, batch_size, self.hidden_size)

        for i in range(self.num_layers):
            x = torch.split(x, batch_sizes)
            if self.training:
                mask = SharedDropout.get_mask(x[0], self.dropout)
                x = [i * mask[:len(i)] for i in x]
            x_i, (h_i, c_i) = self.layer_forward(x=x,
                                                 hx=(h[i, 0], c[i, 0]),
                                                 cell=self.f_cells[i],
                                                 batch_sizes=batch_sizes)
            if self.bidirectional:
                x_b, (h_b, c_b) = self.layer_forward(x=x,
                                                     hx=(h[i, 1], c[i, 1]),
                                                     cell=self.b_cells[i],
                                                     batch_sizes=batch_sizes,
                                                     reverse=True)
                x_i = torch.cat((x_i, x_b), -1)
                h_i = torch.stack((h_i, h_b))
                c_i = torch.stack((c_i, c_b))
            x = x_i
            h_n.append(h_i)
            c_n.append(h_i)

        x = PackedSequence(x,
                           sequence.batch_sizes,
                           sequence.sorted_indices,
                           sequence.unsorted_indices)
        hx = torch.cat(h_n, 0), torch.cat(c_n, 0)
        hx = self.permute_hidden(hx, sequence.unsorted_indices)

        return x, hx


class VariationalLSTMEncoder(VariationalLSTM, ConfigTracker):
    """:class:`VariationalLSTM` wrapper that takes padded batches with a mask instead of packed sequences.

    Args:
        input_size: Number of input features.
        hidden_size: Number of hidden features per direction.
        num_layers: Number of recurrent layers.
        bidirectional: Whether to run a backward direction as well.
        variational_dropout: Dropout ratio handed to :class:`VariationalLSTM`.
        word_dropout: Ratio of the :class:`SharedDropout` applied to the padded output.
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 num_layers=1,
                 bidirectional=False,
                 variational_dropout=0,
                 word_dropout=0,
                 ):
        super().__init__(input_size, hidden_size, num_layers, bidirectional, variational_dropout)
        # ``locals()`` is recorded as the config, so no extra local may be defined before this line.
        ConfigTracker.__init__(self, locals())
        self.lstm_dropout = SharedDropout(p=word_dropout)

    # noinspection PyMethodOverriding
    def forward(self, embed, mask):
        """Encode a padded batch.

        Args:
            embed: Batch-first padded input.
            mask: 2-D mask whose row sums give the sequence lengths.

        Returns:
            The batch-first padded output, restored to the full padded length of ``mask``.
        """
        _, seq_len = mask.shape
        # ``pack_padded_sequence`` requires the lengths on the CPU, even when the data lives on a GPU.
        lengths = mask.sum(1).cpu()
        x = pack_padded_sequence(embed, lengths, True, False)
        x, _ = super().forward(x)
        x, _ = pad_packed_sequence(x, True, total_length=seq_len)
        x = self.lstm_dropout(x)
        return x

    def get_output_dim(self):
        """Return the feature size of the output: the hidden size times the number of directions."""
        return self.hidden_size * self.num_directions


================================================
FILE: hanlp/components/parsers/biaffine_parser_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-22 12:47
import logging
import math
import os
from typing import List
import numpy as np
import tensorflow as tf

from hanlp.components.parsers.parse_alg import unique_root, adjust_root_score, chu_liu_edmonds
from hanlp.layers.transformers.loader_tf import build_transformer

from hanlp.common.keras_component import KerasComponent
from hanlp.components.parsers.alg_tf import tarjan
from hanlp.components.parsers.biaffine_tf.model import BiaffineModelTF
from hanlp.transform.conll_tf import CoNLL_DEP_Transform, CoNLL_Transformer_Transform, CoNLL_SDP_Transform
from hanlp.layers.embeddings.util_tf import build_embedding
from hanlp.layers.transformers.tf_imports import PreTrainedTokenizer, TFAutoModel, TFPreTrainedModel, AutoTokenizer, \
    TFAutoModelWithLMHead, BertTokenizerFast, AlbertConfig, BertTokenizer, TFBertModel
from hanlp.layers.transformers.utils_tf import build_adamw_optimizer
from hanlp.metrics.parsing.labeled_f1_tf import LabeledF1TF
from hanlp.metrics.parsing.labeled_score import LabeledScore
from hanlp_common.util import merge_locals_kwargs


class BiaffineDependencyParserTF(KerasComponent):
    def __init__(self, transform: CoNLL_DEP_Transform = None) -> None:
        """Create the parser, falling back to a fresh ``CoNLL_DEP_Transform`` when none is given.

        Args:
          transform: The transform to use; any falsy value is replaced by a default one.
        """
        transform = transform or CoNLL_DEP_Transform()
        super().__init__(transform)
        self.transform: CoNLL_DEP_Transform = transform
        self.model: BiaffineModelTF = None

    def build_model(self, pretrained_embed, n_embed, training, **kwargs) -> tf.keras.Model:
        """Fill in the vocabulary-dependent config entries and instantiate ``BiaffineModelTF``.

        Args:
          pretrained_embed: Identifier of a pretrained embedding, or a falsy value for none.
          n_embed: Unused here; overwritten in the config when a pretrained embedding is built.
          training: Whether the model is being built for training.
          **kwargs: Ignored.

        Returns:
          The constructed model.
        """
        if not training:
            self.config.lstm_dropout = 0.  # keras will use cuda lstm when config.lstm_dropout is 0
        else:
            self.config.n_words = len(self.transform.form_vocab)
        self.config.n_feats = len(self.transform.cpos_vocab)
        self._init_config()

        pretrained: tf.keras.layers.Embedding = None
        if pretrained_embed:
            pretrained = build_embedding(pretrained_embed, self.transform.form_vocab, self.transform)
            # The pretrained embedding dictates the embedding size.
            self.config.n_embed = pretrained.output_dim
        return BiaffineModelTF(self.config, pretrained)

    def _init_config(self):
        """Copy vocabulary sizes and special indices from the transform into the config."""
        self.config.n_rels = len(self.transform.rel_vocab)
        form_vocab = self.transform.form_vocab
        self.config.pad_index = form_vocab.pad_idx
        self.config.unk_index = form_vocab.unk_idx
        self.config.bos_index = 2

    def load_weights(self, save_dir, filename='model.h5', functional=False, **kwargs):
        """Load weights through the parent class and optionally switch to the functional model.

        Args:
          save_dir: Directory holding the weights.
          filename: Name of the weights file.
          functional: When truthy, ``self.model`` is replaced by ``self.model.to_functional()``.
          **kwargs: Ignored.
        """
        super().load_weights(save_dir, filename)
        if not functional:
            return
        self.model = self.model.to_functional()

    def fit(self, trn_data, dev_data, save_dir,
            n_embed=100,
            pretrained_embed=None,
            embed_dropout=.33,
            n_lstm_hidden=400,
            n_lstm_layers=3,
            lstm_dropout=.33,
            n_mlp_arc=500,
            n_mlp_rel=100,
            mlp_dropout=.33,
            optimizer='adam',
            lr=2e-3,
            mu=.9,
            nu=.9,
            epsilon=1e-12,
            clip=5.0,
            decay=.75,
            decay_steps=5000,
            patience=100,
            arc_loss='sparse_categorical_crossentropy',
            rel_loss='sparse_categorical_crossentropy',
            metrics=('UAS', 'LAS'),
            n_buckets=32,
            batch_size=5000,
            epochs=50000,
            early_stopping_patience=100,
            tree=False,
            punct=False,
            min_freq=2,
            run_eagerly=False, logger=None, verbose=True,
            **kwargs):
        """Train the parser by handing every argument over to the parent ``fit``.

        The arguments are gathered with ``merge_locals_kwargs(locals(), kwargs)``,
        so the body must not create additional locals before that call
        (presumably each local ends up in the saved configuration - verify
        against ``merge_locals_kwargs``).

        Args:
          trn_data: Training data.
          dev_data: Development data.
          save_dir: Directory used by the parent ``fit``.
          optimizer, lr, mu, nu, epsilon, clip, decay, decay_steps: Share their names and
            defaults with the parameters of ``build_optimizer``.
          arc_loss, rel_loss: Share their names with the parameters of ``build_loss``.
          metrics: ``('UAS', 'LAS')`` selects ``LabeledScore`` in ``_build_metrics``;
            anything else selects ``LabeledF1TF``.
          tree: Read by ``decode``; falsy, ``'tarjan'`` or ``'mst'``.
          punct: Read by ``run_metrics``; when falsy punctuation is excluded from scoring.
          **kwargs: Extra settings merged into the forwarded arguments.

        Returns:
          Whatever the parent ``fit`` returns.
        """
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    # noinspection PyMethodOverriding
    def train_loop(self, trn_data, dev_data, epochs, num_examples,
                   train_steps_per_epoch, dev_steps, model, optimizer, loss, metrics,
                   callbacks, logger: logging.Logger, arc_loss, rel_loss,
                   **kwargs):
        """Run the custom training loop, driving the Keras callbacks by hand.

        Args:
          trn_data: Iterable of ``((words, feats), (arcs, rels))`` training batches.
          dev_data: Iterable of development batches with the same structure.
          epochs: Maximum number of epochs.
          train_steps_per_epoch: Ignored; recomputed by counting the batches of ``trn_data``.
          optimizer: Optimizer handed to ``train_batch``.
          loss: Pair ``(arc_loss, rel_loss)`` of loss callables.
          callbacks: Callbacks notified at train/epoch/batch boundaries.
          arc_loss, rel_loss: Ignored; overwritten by the contents of ``loss``.
          num_examples, dev_steps, model, metrics, logger, **kwargs: Unused here.
        """
        arc_loss, rel_loss = loss
        # because we are customizing batching
        # Count lazily: materialising every batch in a list would keep the whole
        # training set in memory just to obtain its length.
        train_steps_per_epoch = sum(1 for _ in iter(trn_data))
        # progbar: tf.keras.callbacks.ProgbarLogger = callbacks[-1]
        c: tf.keras.callbacks.Callback = None
        metric = self._build_metrics()
        for c in callbacks:
            if not hasattr(c, 'params'):
                c.params = dict()
            c.params['epochs'] = epochs
            c.params['trn_data'] = trn_data
            c.params['metrics'] = ['loss'] + self.config.metrics
            c.params['metrics'] = c.params['metrics'] + [f'val_{k}' for k in c.params['metrics']]
            c.on_train_begin()
        for epoch in range(epochs):
            metric.reset_states()
            for c in callbacks:
                c.params['steps'] = train_steps_per_epoch
                c.on_epoch_begin(epoch)
            for idx, ((words, feats), (arcs, rels)) in enumerate(iter(trn_data)):
                logs = {}
                for c in callbacks:
                    c.on_batch_begin(idx, logs)
                # Neither padding nor the BOS position takes part in loss and metrics.
                mask = tf.not_equal(words, self.config.pad_index) & tf.not_equal(words, self.config.bos_index)
                loss, arc_scores, rel_scores = self.train_batch(words, feats, arcs, rels, mask,
                                                                optimizer, arc_loss, rel_loss)
                self.run_metrics(arcs, rels, arc_scores, rel_scores, words, mask, metric)
                logs['loss'] = loss
                logs.update(metric.to_dict())
                if epoch == epochs - 1:
                    self.model.stop_training = True
                for c in callbacks:
                    c.on_batch_end(idx, logs)
            # evaluate on dev
            metric.reset_states()
            logs = {}
            for idx, ((words, feats), (arcs, rels)) in enumerate(iter(dev_data)):
                arc_scores, rel_scores, loss, mask, arc_preds, rel_preds = self.evaluate_batch(words, feats, arcs, rels,
                                                                                               arc_loss, rel_loss,
                                                                                               metric)
                logs['val_loss'] = loss
                logs.update((f'val_{k}', v) for k, v in metric.to_dict().items())

            for c in callbacks:
                c.on_epoch_end(epoch, logs)
            # A callback (or the last-epoch check above) may have requested a stop.
            if getattr(self.model, 'stop_training', None):
                break

        for c in callbacks:
            c.on_train_end()

    def evaluate(self, input_path: str, save_dir=None, output=False, batch_size=None, logger: logging.Logger = None,
                 callbacks: List[tf.keras.callbacks.Callback] = None, warm_up=False, verbose=True, **kwargs):
        """Evaluate through the parent class, defaulting ``batch_size`` to the configured one.

        Args:
          input_path: Path of the data to evaluate on.
          batch_size: Batch size; ``None`` means ``self.config.batch_size``.
          save_dir, output, logger, callbacks, warm_up, verbose, **kwargs: Forwarded unchanged.

        Returns:
          Whatever the parent ``evaluate`` returns.
        """
        effective_batch_size = self.config.batch_size if batch_size is None else batch_size
        return super().evaluate(input_path, save_dir, output, effective_batch_size, logger, callbacks, warm_up,
                                verbose, **kwargs)

    def evaluate_batch(self, words, feats, arcs, rels, arc_loss, rel_loss, metric):
        """Score one batch without training and update ``metric``.

        Returns:
          ``(arc_scores, rel_scores, loss, mask, arc_preds, rel_preds)``.
        """
        not_pad = tf.not_equal(words, self.config.pad_index)
        not_bos = tf.not_equal(words, self.config.bos_index)
        mask = not_pad & not_bos
        arc_scores, rel_scores = self.model((words, feats))
        loss = self.get_loss(arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss)
        predictions = self.run_metrics(arcs, rels, arc_scores, rel_scores, words, mask, metric)
        arc_preds, rel_preds = predictions
        return arc_scores, rel_scores, loss, mask, arc_preds, rel_preds

    def _build_metrics(self):
        """Normalise ``config.metrics`` to a list and create the matching metric object.

        Returns:
          ``LabeledScore`` when the metrics are exactly ``['UAS', 'LAS']``, otherwise ``LabeledF1TF``.
        """
        configured = self.config.metrics
        if isinstance(configured, tuple):
            self.config.metrics = list(configured)
        return LabeledScore() if self.config.metrics == ['UAS', 'LAS'] else LabeledF1TF()

    def run_metrics(self, arcs, rels, arc_scores, rel_scores, words, mask, metric):
        """Decode the scores, update ``metric`` and return the predictions.

        Returns:
          ``(arc_preds, rel_preds)`` as produced by ``decode``.
        """
        arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask)
        if not self.config.punct:
            # Unless punctuation is requested, drop every token id listed in ``transform.puncts``.
            differs_from_puncts = tf.not_equal(tf.expand_dims(words, axis=-1), self.transform.puncts)
            mask &= tf.reduce_all(differs_from_puncts, axis=-1)
        metric(arc_preds, rel_preds, arcs, rels, mask)
        return arc_preds, rel_preds

    def train_batch(self, words, feats, arcs, rels, mask, optimizer, arc_loss, rel_loss):
        """Run one optimisation step on a batch.

        Returns:
          ``(loss, arc_scores, rel_scores)`` computed in training mode.
        """
        with tf.GradientTape() as tape:
            arc_scores, rel_scores = self.model((words, feats), training=True)
            loss = self.get_loss(arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss)
        gradients = tape.gradient(loss, self.model.trainable_variables)
        grads_and_vars = zip(gradients, self.model.trainable_variables)
        optimizer.apply_gradients(grads_and_vars)
        return loss, arc_scores, rel_scores

    def get_loss(self, arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss):
        """Sum the arc loss and the relation loss over the masked tokens.

        Relation scores are taken at the gold head of every token before the
        relation loss is computed.
        """
        arc_scores, rel_scores = arc_scores[mask], rel_scores[mask]
        arcs, rels = arcs[mask], rels[mask]
        token_ids = tf.range(len(arcs), dtype=tf.int64)
        gold_head_index = tf.stack([token_ids, arcs], axis=1)
        rel_scores = tf.gather_nd(rel_scores, gold_head_index)
        return arc_loss(arcs, arc_scores) + rel_loss(rels, rel_scores)

    def build_optimizer(self, optimizer='adam', lr=2e-3, mu=.9, nu=.9, epsilon=1e-12, clip=5.0, decay=.75,
                        decay_steps=5000, **kwargs):
        """Build the optimizer.

        For ``'adam'`` an ``AdamTF`` with an exponentially decaying learning
        rate and gradient-norm clipping is returned; every other value is
        delegated to the parent class together with ``kwargs``.
        """
        if optimizer != 'adam':
            return super().build_optimizer(optimizer, **kwargs)
        schedule = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=lr,
                                                                  decay_steps=decay_steps,
                                                                  decay_rate=decay)
        from hanlp.optimizers.adamw.optimization import AdamTF
        return AdamTF(learning_rate=schedule, beta_1=mu, beta_2=nu, epsilon=epsilon, clipnorm=clip)

    # noinspection PyMethodOverriding
    def build_loss(self, arc_loss, rel_loss, **kwargs):
        """Turn the loss identifiers into loss objects.

        ``'binary_crossentropy'`` (arcs only) and ``'sparse_categorical_crossentropy'``
        are built here with ``from_logits=True``; any other identifier is
        resolved by the parent class.

        Returns:
          The pair ``(arc_loss, rel_loss)``.
        """
        sparse_name = 'sparse_categorical_crossentropy'

        if arc_loss == 'binary_crossentropy':
            arc_loss = tf.losses.BinaryCrossentropy(from_logits=True)
        elif arc_loss == sparse_name:
            arc_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        else:
            arc_loss = super().build_loss(arc_loss)

        if rel_loss == sparse_name:
            rel_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        else:
            rel_loss = super().build_loss(rel_loss)
        return arc_loss, rel_loss

    @property
    def sample_data(self):
        """A pair of constant ``int64`` tensors of shape ``(2, 3)`` usable as a dummy model input."""
        first = tf.constant([[2, 3, 4], [2, 5, 0]], dtype=tf.int64)
        second = tf.constant([[1, 2, 3], [4, 5, 0]], dtype=tf.int64)
        return first, second

    def num_samples_in(self, dataset):
        """Count the samples of ``dataset`` by summing ``len(batch[0][0])`` over its batches."""
        total = 0
        for batch in iter(dataset):
            total += len(batch[0][0])
        return total

    def build_train_dataset(self, trn_data, batch_size, num_examples):
        """Create the shuffled, non-repeating training dataset from a file.

        ``num_examples`` is accepted for interface compatibility and not used.
        """
        return self.transform.file_to_dataset(trn_data, batch_size=batch_size, shuffle=True, repeat=None)

    # noinspection PyMethodOverriding
    def build_callbacks(self, save_dir, logger, metrics, **kwargs):
        """Extend the parent's callbacks with a progress bar and attach params and model to all.

        Returns:
          The list of callbacks.
        """
        callbacks = super().build_callbacks(save_dir, logger, metrics=metrics, **kwargs)
        progbar_metrics = list(metrics) if isinstance(metrics, tuple) else metrics
        callbacks.append(self.build_progbar(progbar_metrics))
        shared_params = {'verbose': 1, 'epochs': 1}
        for callback in callbacks:
            callback.set_params(shared_params)
            callback.set_model(self.model)
        return callbacks

    def build_progbar(self, metrics, training=True):
        """Create a step-counting ``ProgbarLogger``.

        Args:
          metrics: Names of the metrics; a list or a tuple.
          training: When truthy the metrics and their ``val_`` counterparts are
            registered as stateful metrics, otherwise none are.

        Returns:
          The progress bar callback.
        """
        stateful_metrics = []
        if training:
            # ``metrics`` may still be the tuple passed to ``fit``; tuple + list would raise TypeError.
            metrics = list(metrics)
            stateful_metrics = metrics + [f'val_{k}' for k in metrics]
        return tf.keras.callbacks.ProgbarLogger(count_mode='steps', stateful_metrics=stateful_metrics)

    def decode(self, arc_scores, rel_scores, mask):
        """Turn arc and relation scores into head and relation predictions.

        When ``config.tree`` is falsy the heads are the arg-max over the last axis
        of ``arc_scores`` and each relation is the arg-max relation at the
        predicted head. Otherwise every sentence is decoded separately with the
        algorithm named by ``config.tree`` (``'tarjan'`` or ``'mst'``).

        Args:
          arc_scores: Arc scores of a batch.
          rel_scores: Relation scores of a batch.
          mask: Boolean mask of the tokens to decode.

        Returns:
          ``(arc_preds, rel_preds)`` as tensors.

        Raises:
          ValueError: If ``config.tree`` is truthy but names an unknown algorithm.
        """
        if self.config.tree:
            root_rel_idx = self.transform.root_rel_idx
            # One-hot row that forces the root relation.
            root_rel_onehot = np.eye(len(self.transform.rel_vocab))[root_rel_idx]
            arc_preds = np.zeros_like(mask, dtype=np.int64)
            rel_preds = np.zeros_like(mask, dtype=np.int64)
            # arc_pred / rel_pred are rows of the arrays above and are filled in place.
            for arc, rel, m, arc_pred, rel_pred in zip(arc_scores, rel_scores, mask, arc_preds, rel_preds):
                # Number of masked-in tokens plus one (presumably the extra slot is the root/BOS position).
                length = int(tf.math.count_nonzero(m)) + 1
                arc = arc[:length, :length]
                arc_probs = tf.nn.softmax(arc).numpy()
                m = np.expand_dims(m.numpy()[:length], -1)
                if self.config.tree == 'tarjan':
                    heads = tarjan(arc_probs, length, m)
                elif self.config.tree == 'mst':
                    heads, head_probs, tokens = unique_root(arc_probs, m, length)
                    arc = arc.numpy()
                    adjust_root_score(arc, heads, root_rel_idx)
                    heads = chu_liu_edmonds(arc, length)
                else:
                    raise ValueError(f'Unknown tree algorithm {self.config.tree}')
                arc_pred[:length] = heads
                # Positions (excluding index 0) whose head is 0.
                root = np.where(heads[np.arange(1, length)] == 0)[0] + 1
                rel_prob = tf.nn.softmax(rel[:length, :length, :]).numpy()
                # Keep only the relation distribution at each token's chosen head.
                rel_prob = rel_prob[np.arange(length), heads]
                rel_prob[root] = root_rel_onehot
                # Zero the root relation for the positions selected by ``!= root``.
                rel_prob[np.arange(length) != root, np.arange(len(self.transform.rel_vocab)) == root_rel_idx] = 0
                # rels = rel_argmax(rel_prob, length, root_rel_idx)
                rels = np.argmax(rel_prob, axis=1)
                rel_pred[:length] = rels
            arc_preds = tf.constant(arc_preds)
            rel_preds = tf.constant(rel_preds)
        else:
            arc_preds = tf.argmax(arc_scores, -1)
            rel_preds = tf.argmax(rel_scores, -1)
            # Pick, for every token, the best relation at its predicted head.
            rel_preds = tf.squeeze(tf.gather(rel_preds, tf.expand_dims(arc_preds, -1), batch_dims=2), axis=-1)

        return arc_preds, rel_preds

    def evaluate_dataset(self, tst_data, callbacks, output, num_batches, ret_scores=None, **kwargs):
        """Evaluate on a dataset, optionally writing predictions and collecting raw scores.

        Args:
          tst_data: Iterable of ``((words, feats), Y)`` batches where ``Y[0]`` holds arcs and ``Y[1]`` relations.
          callbacks: Ignored; a fresh progress bar is the only callback used.
          output: Path of a file to write predictions to, or a falsy value to skip writing.
          num_batches: Unused.
          ret_scores: When truthy, per-batch ``(arc_scores, rel_scores, mask)`` numpy arrays are appended to the result.
          **kwargs: Ignored.

        Returns:
          ``(loss, metric_dict, False)``, extended by the list of scores when ``ret_scores`` is truthy.
        """
        if 'mask_p' in self.config:
            self.config['mask_p'] = None
        arc_loss, rel_loss = self.build_loss(**self.config)
        # Build the metric first: it normalises ``config.metrics`` from tuple to list,
        # which the progress bar and ``params`` below rely on.
        metric = self._build_metrics()
        progbar = self.build_progbar(self.config['metrics'])
        callbacks = [progbar]
        # Count lazily instead of keeping every batch alive in a list.
        steps_per_epoch = sum(1 for _ in iter(tst_data))
        params = {'verbose': 1, 'epochs': 1, 'metrics': ['loss'] + self.config.metrics, 'steps': steps_per_epoch}
        for c in callbacks:
            c.set_params(params)
            c.on_test_begin()
            c.on_epoch_end(0)
        logs = {}
        scores = []
        ext = None
        out_file = None
        if output:
            ext = os.path.splitext(output)[-1]
            out_file = open(output, 'w', encoding='utf-8')
        try:
            for idx, ((words, feats), Y) in enumerate(iter(tst_data)):
                arcs, rels = Y[0], Y[1]
                for c in callbacks:
                    c.on_test_batch_begin(idx, logs)
                arc_scores, rel_scores, loss, mask, arc_preds, rel_preds = self.evaluate_batch(words, feats, arcs,
                                                                                               rels, arc_loss,
                                                                                               rel_loss, metric)
                if ret_scores:
                    scores.append((arc_scores.numpy(), rel_scores.numpy(), mask.numpy()))
                if out_file is not None:
                    for sent in self.transform.XY_to_inputs_outputs((words, feats, mask), (arc_preds, rel_preds),
                                                                    conll=ext, arc_scores=arc_scores,
                                                                    rel_scores=rel_scores):
                        out_file.write(str(sent))
                        out_file.write('\n\n')
                logs['loss'] = loss
                logs.update(metric.to_dict())
                for c in callbacks:
                    c.on_test_batch_end(idx, logs)
            for c in callbacks:
                c.on_epoch_end(0)
                c.on_test_end()
        finally:
            # Close the prediction file even when a batch fails.
            if out_file is not None:
                out_file.close()
        # Average loss as accumulated by the progress bar: sum / count.
        loss_sum, loss_count = progbar.progbar._values['loss'][0], progbar.progbar._values['loss'][1]
        loss = float(loss_sum / loss_count)
        outputs = loss, metric.to_dict(), False
        if ret_scores:
            outputs += (scores,)
        return outputs

    def predict_batch(self, batch, inputs=None, conll=True, **kwargs):
        """Yield the decoded sentences of one batch.

        Args:
          batch: ``((words, feats), (arcs, rels))``; the second pair is not used.
          inputs: Forwarded to ``transform.XY_to_inputs_outputs``.
          conll: Forwarded to ``transform.XY_to_inputs_outputs``.
          **kwargs: Ignored.
        """
        (words, feats), (_arcs, _rels) = batch
        not_pad = tf.not_equal(words, self.config.pad_index)
        not_bos = tf.not_equal(words, self.config.bos_index)
        mask = not_pad & not_bos
        arc_scores, rel_scores = self.model((words, feats))
        predictions = self.decode(arc_scores, rel_scores, mask)
        arc_preds, rel_preds = predictions
        yield from self.transform.XY_to_inputs_outputs((words, feats, mask), (arc_preds, rel_preds), gold=False,
                                                       inputs=inputs, conll=conll)

    def compile_model(self, optimizer, loss, metrics):
        """Delegate model compilation to the parent class unchanged; the parent's result is not returned."""
        super().compile_model(optimizer, loss, metrics)


class BiaffineSemanticDependencyParserTF(BiaffineDependencyParserTF):
    """Biaffine parser variant whose arcs are scored independently per token pair.

    Compared with the parent class it defaults to a ``CoNLL_SDP_Transform``, a
    binary cross-entropy arc loss and ``('UF', 'LF')`` metrics, and it decodes
    an arc wherever its score is positive.
    """

    def __init__(self, transform: CoNLL_SDP_Transform = None) -> None:
        """Create the parser, using a fresh ``CoNLL_SDP_Transform`` when ``transform`` is falsy."""
        transform = transform or CoNLL_SDP_Transform()
        # noinspection PyTypeChecker
        super().__init__(transform)
        self.transform: CoNLL_SDP_Transform = transform

    def fit(self, trn_data, dev_data, save_dir,
            n_embed=100,
            pretrained_embed=None,
            embed_dropout=.33,
            n_lstm_hidden=400,
            n_lstm_layers=3,
            lstm_dropout=.33,
            n_mlp_arc=500,
            n_mlp_rel=100,
            mlp_dropout=.33,
            optimizer='adam',
            lr=2e-3,
            mu=.9,
            nu=.9,
            epsilon=1e-12,
            clip=5.0,
            decay=.75,
            decay_steps=5000,
            patience=100,
            arc_loss='binary_crossentropy',
            rel_loss='sparse_categorical_crossentropy',
            metrics=('UF', 'LF'),
            n_buckets=32,
            batch_size=5000,
            epochs=50000,
            early_stopping_patience=100,
            tree=False,
            punct=False,
            min_freq=2,
            run_eagerly=False,
            logger=None,
            verbose=True,
            **kwargs):
        """Train with graph-parsing defaults; every argument is passed on to the parent ``fit``."""
        return super().fit(trn_data, dev_data, save_dir, n_embed, pretrained_embed, embed_dropout, n_lstm_hidden,
                           n_lstm_layers, lstm_dropout, n_mlp_arc, n_mlp_rel, mlp_dropout, optimizer, lr, mu, nu,
                           epsilon, clip, decay, decay_steps, patience, arc_loss, rel_loss, metrics, n_buckets,
                           batch_size, epochs, early_stopping_patience, tree, punct, min_freq, run_eagerly, logger,
                           verbose, **kwargs)

    def get_loss(self, arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss):
        """Sum the arc loss over all valid token pairs and the relation loss over the pairs selected by ``arcs``."""
        # Expand the token mask to a pair mask: a pair counts only if both of its tokens do.
        pair_mask = tf.tile(tf.expand_dims(mask, -1), [1, 1, tf.shape(mask)[-1]])
        pair_mask &= tf.transpose(pair_mask, [0, 2, 1])
        arc_scores, rel_scores = arc_scores[pair_mask], rel_scores[pair_mask]
        arcs, rels = arcs[pair_mask], rels[pair_mask]
        # ``arcs`` is used as an index here (presumably a boolean gold-arc mask - confirm).
        rel_scores, rels = rel_scores[arcs], rels[arcs]
        return arc_loss(arcs, arc_scores) + rel_loss(rels, rel_scores)

    def decode(self, arc_scores, rel_scores, mask):
        """Predict an arc wherever its score is positive and the arg-max relation everywhere."""
        return arc_scores > 0, tf.argmax(rel_scores, -1)


class BiaffineTransformerDependencyParserTF(BiaffineDependencyParserTF, tf.keras.callbacks.Callback):
    def __init__(self, transform: CoNLL_Transformer_Transform = None) -> None:
        """Create the parser, using a fresh ``CoNLL_Transformer_Transform`` when ``transform`` is falsy."""
        transform = transform or CoNLL_Transformer_Transform()
        super().__init__(transform)
        self.transform: CoNLL_Transformer_Transform = transform

    def build_model(self, transformer, training, **kwargs) -> tf.keras.Model:
        """Load the transformer encoder and wrap it in a ``BiaffineModelTF``.

        Args:
          transformer: Identifier understood by ``build_transformer``.
          training: Whether the model is being built for training.
          **kwargs: Ignored.
        """
        encoder = self.build_transformer(training, transformer)
        return BiaffineModelTF(self.config, transformer=encoder)

    def build_transformer(self, training, transformer):
        """Load the transformer and its tokenizer and register both on the transform.

        Args:
          training: When truthy, ``config.n_words`` is refreshed from the form vocabulary.
          transformer: Either a string identifier, or a sequence whose first element is
            ``'AutoModelWithLMHead'`` and whose second element is the identifier.

        Returns:
          The loaded transformer model.

        Raises:
          ValueError: If ``transformer`` is neither of the supported forms.
        """
        if training:
            self.config.n_words = len(self.transform.form_vocab)
        self._init_config()
        if isinstance(transformer, str):
            # The loading strategy is selected from the identifier's spelling.
            if 'albert_chinese' in transformer:
                tokenizer = BertTokenizerFast.from_pretrained(transformer, add_special_tokens=False)
                transformer: TFPreTrainedModel = TFAutoModel.from_pretrained(transformer, name=transformer,
                                                                             from_pt=True)
            elif transformer.startswith('albert') and transformer.endswith('zh'):
                transformer, tokenizer, path = build_transformer(transformer)
                # Config and vocabulary are read from files inside the returned path.
                transformer.config = AlbertConfig.from_json_file(os.path.join(path, "albert_config.json"))
                tokenizer = BertTokenizer.from_pretrained(os.path.join(path, "vocab_chinese.txt"),
                                                          add_special_tokens=False)
            elif 'chinese-roberta' in transformer:
                tokenizer = BertTokenizer.from_pretrained(transformer)
                transformer = TFBertModel.from_pretrained(transformer, name=transformer, from_pt=True)
            else:
                tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(transformer)
                try:
                    transformer: TFPreTrainedModel = TFAutoModel.from_pretrained(transformer, name=transformer)
                except (TypeError, OSError):
                    # Retry loading from PyTorch weights when the first attempt fails.
                    transformer: TFPreTrainedModel = TFAutoModel.from_pretrained(transformer, name=transformer,
                                                                                 from_pt=True)
        elif transformer[0] == 'AutoModelWithLMHead':
            tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(transformer[1])
            transformer: TFAutoModelWithLMHead = TFAutoModelWithLMHead.from_pretrained(transformer[1])
        else:
            raise ValueError(f'Unknown identifier {transformer}')
        self.transform.tokenizer = tokenizer
        if self.config.get('fp16', None) or self.config.get('use_amp', None):
            # Sets the global Keras mixed-precision policy and casts the loaded weights to float16.
            policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
            tf.keras.mixed_precision.experimental.set_policy(policy)
            # tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
            transformer.set_weights([w.astype('float16') for w in transformer.get_weights()])
        self.transform.transformer_config = transformer.config
        return transformer

    # noinspection PyMethodOverriding
    def fit(self, trn_data, dev_data, save_dir, transformer, max_seq_length=256, transformer_dropout=.33,
            d_positional=None,
            n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33,
            optimizer='adamw',
            learning_rate=5e-5,
            learning_rate_transformer=None,
            weight_decay_rate=0,
            epsilon=1e-8,
            clipnorm=None,
            fp16=False,
            warmup_steps_ratio=0,
            arc_loss='sparse_categorical_crossentropy', rel_loss='sparse_categorical_crossentropy',
            metrics=('UAS', 'LAS'),
            batch_size=3000,
            samples_per_batch=150,
            max_samples_per_batch=None,
            epochs=100,
            tree=False, punct=False, token_mapping=None, run_eagerly=False, logger=None, verbose=True, **kwargs):
        """Train the transformer based parser.

        The callback parameters of this object are reset to an empty dict, then
        every argument is gathered with ``merge_locals_kwargs(locals(), kwargs)``
        and passed to ``KerasComponent.fit`` directly, bypassing the ``fit`` of
        ``BiaffineDependencyParserTF``. No extra local may be created before that call.

        Args:
          trn_data: Training data.
          dev_data: Development data.
          save_dir: Directory used by ``KerasComponent.fit``.
          transformer: Identifier accepted by ``build_transformer``.
          learning_rate_transformer: Read by ``build_optimizer``; ``None`` trains everything
            with one optimizer, a positive value creates a separate transformer optimizer
            and any other value freezes the transformer.
          samples_per_batch: Number of accumulated samples after which ``train_batch``
            applies the accumulated gradients.
          warmup_steps_ratio: Fraction of the training steps used for warm-up in ``build_optimizer``.
          **kwargs: Extra settings merged into the forwarded arguments.

        Returns:
          Whatever ``KerasComponent.fit`` returns.
        """
        self.set_params({})
        return KerasComponent.fit(self, **merge_locals_kwargs(locals(), kwargs))

    @property
    def sample_data(self):
        """Inputs of the first batch built from two toy sentences.

        The sentences carry ``(word, tag)`` pairs when ``config['use_pos']`` is
        truthy and plain words otherwise.
        """
        if self.config.get('use_pos', None):
            samples = [[('Hello', 'NN'), ('world', 'NN')], [('HanLP', 'NN'), ('is', 'NN'), ('good', 'NN')]]
        else:
            samples = [['Hello', 'world'], ['HanLP', 'is', 'good']]
        dataset = self.transform.inputs_to_dataset(samples)
        first_batch = next(iter(dataset))
        return first_batch[0]

    # noinspection PyMethodOverriding
    def build_optimizer(self, optimizer, learning_rate, epsilon, weight_decay_rate, clipnorm, fp16, train_steps,
                        **kwargs):
        """Build the optimizer(s) for the biaffine layers and the transformer.

        For ``'adamw'`` the number of training steps is recomputed from the
        config (the ``train_steps`` argument is ignored). Depending on
        ``kwargs['learning_rate_transformer']``:

        - ``None``: a single AdamW optimizer is returned for all variables;
        - positive: an AdamW optimizer for the transformer is stored in
          ``self.params['optimizer_transformer']`` and a plain Adam is returned;
        - otherwise: the transformer is frozen and a plain Adam is returned.

        Any other ``optimizer`` is delegated to the parent class.
        """
        if optimizer != 'adamw':
            return super().build_optimizer(optimizer, **kwargs)

        epochs = self.config['epochs']
        learning_rate_transformer = kwargs.get('learning_rate_transformer', None)
        train_steps = math.ceil(self.config.train_examples * epochs / self.config.samples_per_batch)
        warmup_steps = math.ceil(train_steps * self.config['warmup_steps_ratio'])

        if learning_rate_transformer is None:
            return build_adamw_optimizer(self.config, learning_rate, epsilon, clipnorm, train_steps, fp16,
                                         warmup_steps, weight_decay_rate)

        if learning_rate_transformer > 0:
            self.params['optimizer_transformer'] = build_adamw_optimizer(self.config, learning_rate_transformer,
                                                                         epsilon, clipnorm, train_steps, fp16,
                                                                         warmup_steps, weight_decay_rate)
        else:
            self.model.transformer.trainable = False
        # use a normal adam for biaffine
        return super().build_optimizer(lr=learning_rate)

    def build_vocab(self, trn_data, logger):
        """Build the vocabulary via the parent class and remember its result as ``config.train_examples``."""
        train_examples = super().build_vocab(trn_data, logger)
        self.config.train_examples = train_examples
        return train_examples

    def build_callbacks(self, save_dir, logger, metrics, **kwargs):
        """Build the parent's callbacks and register this parser itself as the last callback.

        Returns:
          The list of callbacks, ending with ``self``.
        """
        callbacks = super().build_callbacks(save_dir, logger, metrics=metrics, **kwargs)
        # The parser is itself a Keras callback (it accumulates gradients across batches).
        callbacks.append(self)
        # Make sure ``self.params`` is a dict before the training loop writes into it.
        if not self.params:
            self.set_params({})
        return callbacks

    def on_train_begin(self, logs=None):
        """Prepare the gradient accumulation state before training starts.

        Creates one zero-initialised, non-trainable accumulator per trainable
        variable, resets the accumulated sample counter and records the names
        of the transformer's trainable variables.

        Args:
          logs: Unused; accepted so the method matches the Keras ``Callback``
            interface, which passes ``logs`` to this hook.
        """
        self.params['accum_grads'] = [tf.Variable(tf.zeros_like(tv.read_value()), trainable=False) for tv in
                                      self.model.trainable_variables]
        self.params['trained_samples'] = 0
        self.params['transformer_variable_names'] = {x.name for x in self.model.transformer.trainable_variables}

    def train_batch(self, words, feats, arcs, rels, mask, optimizer, arc_loss, rel_loss):
        """Accumulate the gradients of one batch and apply them once enough samples were seen.

        The ``optimizer`` argument is not used; updates go through ``_apply_grads``.

        Returns:
          ``(loss, arc_scores, rel_scores)`` computed in training mode.
        """
        with tf.GradientTape() as tape:
            arc_scores, rel_scores = self.model((words, feats), training=True)
            loss = self.get_loss(arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss)
        gradients = tape.gradient(loss, self.model.trainable_variables)
        accumulators = self.params['accum_grads']
        for position, gradient in enumerate(gradients):
            if gradient is None:
                continue
            accumulators[position].assign_add(gradient)
        self.params['trained_samples'] += tf.shape(words)[0]
        enough_samples = self.params['trained_samples'] >= self.config.samples_per_batch
        if enough_samples:
            self._apply_grads(accumulators)
        return loss, arc_scores, rel_scores

    def _apply_grads(self, accum_grads):
        """Apply the accumulated gradients, then reset the accumulators and the sample counter.

        When a dedicated transformer optimizer exists, transformer variables are
        updated with it and all remaining variables with the model's optimizer.
        """
        optimizer_transformer = self.params.get('optimizer_transformer', None)
        grads_and_vars = list(zip(accum_grads, self.model.trainable_variables))
        if optimizer_transformer:
            transformer_names = self.params['transformer_variable_names']
            for_transformer = [(g, w) for g, w in grads_and_vars if w.name in transformer_names]
            for_the_rest = [(g, w) for g, w in grads_and_vars if w.name not in transformer_names]
            optimizer_transformer.apply_gradients(for_transformer)
            self.model.optimizer.apply_gradients(for_the_rest)
        else:
            self.model.optimizer.apply_gradients(grads_and_vars)
        for accumulator in accum_grads:
            accumulator.assign(tf.zeros_like(accumulator))
        self.params['trained_samples'] = 0

    def on_epoch_end(self, epoch, logs=None):
        """Flush gradients that are still accumulated when the epoch ends."""
        if not self.params['trained_samples']:
            return
        self._apply_grads(self.params['accum_grads'])


class BiaffineTransformerSemanticDependencyParser(BiaffineTransformerDependencyParserTF):
    """Transformer based parser that reuses the loss and decoding of ``BiaffineSemanticDependencyParserTF``."""

    def __init__(self, transform: CoNLL_Transformer_Transform = None) -> None:
        """Create the parser, using ``CoNLL_Transformer_Transform(graph=True)`` when ``transform`` is falsy."""
        if not transform:
            transform = CoNLL_Transformer_Transform(graph=True)
        super().__init__(transform)

    def get_loss(self, arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss):
        """Compute the loss with ``BiaffineSemanticDependencyParserTF.get_loss`` called on this instance."""
        return BiaffineSemanticDependencyParserTF.get_loss(self, arc_scores, rel_scores, arcs, rels, mask, arc_loss,
                                                           rel_loss)

    def fit(self, trn_data, dev_data, save_dir, transformer, max_seq_length=256, transformer_dropout=.33,
            d_positional=None, n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, optimizer='adamw', learning_rate=5e-5,
            learning_rate_transformer=None, weight_decay_rate=0, epsilon=1e-8, clipnorm=None, fp16=False,
            warmup_steps_ratio=0, arc_loss='binary_crossentropy',
            rel_loss='sparse_categorical_crossentropy', metrics=('UF', 'LF'), batch_size=3000, samples_per_batch=150,
            max_samples_per_batch=None, epochs=100, tree=False, punct=False, token_mapping=None, enhanced_only=False,
            run_eagerly=False,
            logger=None, verbose=True, **kwargs):
        """Train with graph-parsing defaults (binary cross-entropy arc loss, ``('UF', 'LF')`` metrics).

        All arguments are gathered with ``merge_locals_kwargs(locals(), kwargs)`` and
        passed to the parent ``fit``, so no extra local may be created before that call.
        """
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def decode(self, arc_scores, rel_scores, mask):
        """Decode with ``BiaffineSemanticDependencyParserTF.decode`` called on this instance."""
        return BiaffineSemanticDependencyParserTF.decode(self, arc_scores, rel_scores, mask)


================================================
FILE: hanlp/components/parsers/biaffine_tf/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-26 23:03

================================================
FILE: hanlp/components/parsers/biaffine_tf/alg.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-26 19:49
# Ported from the PyTorch implementation https://github.com/zysite/biaffine-parser
from typing import List
import numpy as np
import tensorflow as tf
from collections import defaultdict


def nonzero(t: tf.Tensor) -> tf.Tensor:
    """Return the indices of the strictly positive entries of ``t``."""
    is_positive = t > 0
    return tf.where(is_positive)


def view(t: tf.Tensor, *dims) -> tf.Tensor:
    """Reshape ``t`` to the shape given by the positional ``dims``."""
    target_shape = dims
    return tf.reshape(t, target_shape)


def arange(n: int) -> tf.Tensor:
    """Return the tensor ``[0, 1, ..., n - 1]``."""
    return tf.range(0, n)


def randperm(n: int) -> tf.Tensor:
    """Return a random permutation of ``0 .. n - 1``."""
    ordered = tf.range(n)
    return tf.random.shuffle(ordered)


def tolist(t: tf.Tensor) -> List:
    """Convert a tensor, or any object offering ``tolist()``, into a Python list."""
    array_like = t.numpy() if isinstance(t, tf.Tensor) else t
    return array_like.tolist()


def kmeans(x, k, seed=None):
    """Cluster one-dimensional values (sentence lengths) into at most ``k`` buckets.

    See https://github.com/zysite/biaffine-parser/blob/master/parser/utils/alg.py#L7

    Args:
      x(list): Lengths of sentences
      k(int): Number of clusters to aim for.
      seed: Seed for the random choice of the initial centroids (Default value = None)

    Returns:
      A pair ``(centroids, clusters)``: the centroids of the non-empty clusters
      as a list of floats, and for each of them the list of indices into ``x``
      assigned to it.

    Raises:
      AssertionError: If ``x`` has fewer than ``k`` distinct values.
    """
    x = tf.constant(x, dtype=tf.float32)
    # count the frequency of each datapoint
    d, indices, f = tf.unique_with_counts(x, tf.int32)
    f = tf.cast(f, tf.float32)
    # calculate the sum of the values of the same datapoints
    total = d * f
    # initialize k centroids randomly
    c, old = tf.random.shuffle(d, seed)[:k], None
    # assign labels to each datapoint based on centroids
    dists = tf.abs(tf.expand_dims(d, -1) - c)
    y = tf.argmin(dists, axis=-1, output_type=tf.int32)
    dists = tf.gather_nd(dists, tf.transpose(tf.stack([tf.range(tf.shape(dists)[0], dtype=tf.int32), y])))
    # make sure number of datapoints is greater than that of clusters
    assert len(d) >= k, f"unable to assign {len(d)} datapoints to {k} clusters"

    while old is None or not tf.reduce_all(c == old):
        # if an empty cluster is encountered,
        # choose the farthest datapoint from the biggest cluster
        # and move that the empty one
        for i in range(k):
            if not tf.reduce_any(y == i):
                mask = tf.cast(y == tf.expand_dims(tf.range(k, dtype=tf.int32), -1), tf.float32)
                lens = tf.reduce_sum(mask, axis=-1)
                biggest = view(nonzero(mask[tf.argmax(lens)]), -1)
                farthest = tf.argmax(tf.gather(dists, biggest))
                # Tensors are immutable: the scatter returns a NEW tensor, which must be
                # bound back to ``y``. Otherwise the cluster stays empty, its centroid
                # becomes 0/0 = NaN and the convergence test can never succeed.
                y = tf.tensor_scatter_nd_update(y,
                                                tf.expand_dims(tf.expand_dims(biggest[farthest], -1), -1),
                                                tf.constant([i], dtype=y.dtype))
        mask = tf.cast(y == tf.expand_dims(tf.range(k, dtype=tf.int32), -1), tf.float32)
        # update the centroids
        c, old = tf.cast(tf.reduce_sum(total * mask, axis=-1), tf.float32) / tf.cast(tf.reduce_sum(f * mask, axis=-1),
                                                                                     tf.float32), c
        # re-assign all datapoints to clusters
        dists = tf.abs(tf.expand_dims(d, -1) - c)
        y = tf.argmin(dists, axis=-1, output_type=tf.int32)
        dists = tf.gather_nd(dists, tf.transpose(tf.stack([tf.range(tf.shape(dists)[0], dtype=tf.int32), y])))
    # assign all datapoints to the new-generated clusters
    # without considering the empty ones
    y, (assigned, _) = tf.gather(y, indices), tf.unique(y)
    # get the centroids of the assigned clusters
    centroids = tf.gather(c, assigned).numpy().tolist()
    # map all values of datapoints to buckets
    clusters = [tf.squeeze(tf.where(y == i), axis=-1).numpy().tolist() for i in assigned]

    return centroids, clusters


# ***************************************************************
class Tarjan:
    """Tarjan's algorithm for the strongly connected components (cycles) of a predicted dependency graph.

    The graph has one vertex per kept token plus vertex ``0``; every prediction
    contributes a directed edge ``head -> dependent``. All components are
    computed eagerly in the constructor and exposed through :attr:`SCCs`.
    """

    def __init__(self, prediction, tokens):
        """

        Parameters
        ----------
        prediction : numpy.ndarray
            a predicted dependency tree where prediction[dep_idx] = head_idx
        tokens : numpy.ndarray
            the tokens we care about (i.e. exclude _GO, _EOS, and _PAD).
            Vertices are numbered ``position + 1`` within ``tokens``, so this
            presumably expects ``np.arange(1, length)`` -- confirm against callers.
        """
        self._edges = defaultdict(set)
        self._vertices = set((0,))
        for dep, head in enumerate(prediction[tokens]):
            self._vertices.add(dep + 1)
            self._edges[head].add(dep + 1)
        self._indices = {}
        self._lowlinks = {}
        self._onstack = defaultdict(lambda: False)
        self._SCCs = []

        index = 0
        stack = []
        for v in self.vertices:
            if v not in self.indices:
                # Carry the counter over so that discovery indices stay unique
                # across separate depth-first searches.
                index = self.strongconnect(v, index, stack)

    # =============================================================
    def strongconnect(self, v, index, stack):
        """Run the depth-first search of Tarjan's algorithm starting at ``v``.

        Args:
          v: the vertex to visit; it must not have been visited before.
          index: the smallest discovery index that has not been handed out yet.
          stack: the shared list of vertices whose component is still open.

        Returns:
          The next unused discovery index. Integers are passed by value, so the
          counter has to travel back to the caller; otherwise sibling subtrees
          would reuse indices and the low-link comparison could split one
          component into several.

        """

        self._indices[v] = index
        self._lowlinks[v] = index
        index += 1
        stack.append(v)
        self._onstack[v] = True
        for w in self.edges[v]:
            if w not in self.indices:
                index = self.strongconnect(w, index, stack)
                self._lowlinks[v] = min(self._lowlinks[v], self._lowlinks[w])
            elif self._onstack[w]:
                self._lowlinks[v] = min(self._lowlinks[v], self._indices[w])

        # v is the root of a component: everything above it on the stack belongs to it.
        if self._lowlinks[v] == self._indices[v]:
            component = set()
            while True:
                w = stack.pop()
                self._onstack[w] = False
                component.add(w)
                if w == v:
                    break
            self._SCCs.append(component)
        return index

    # ======================
    @property
    def edges(self):
        """Mapping ``head -> set of dependents`` (a ``defaultdict``, so lookups of unknown heads create an empty set)."""
        return self._edges

    @property
    def vertices(self):
        """The set of vertex ids, including vertex ``0``."""
        return self._vertices

    @property
    def indices(self):
        """Mapping ``vertex -> discovery index`` assigned during the search."""
        return self._indices

    @property
    def SCCs(self):
        """List of strongly connected components, each a set of vertex ids."""
        return self._SCCs


def tarjan(parse_probs, length, tokens_to_keep, ensure_tree=True):
    """Decode head predictions from arc probabilities, optionally repairing them into a tree.

    Adopted from Timothy Dozat https://github.com/tdozat/Parser/blob/master/lib/models/nn.py

    Args:
      parse_probs(NDArray): seq_len x seq_len, the probability of arcs; row ``d`` is
        arg-maxed to obtain the head of token ``d``
      length(NDArray): sentence length including ROOT
      tokens_to_keep(NDArray): mask matrix, multiplied into ``parse_probs``
      ensure_tree: when True, force exactly one token to attach to position 0 and
        break every cycle found by :class:`Tarjan`  (Default value = True)

    Returns:
      The predicted head index of every position. The caller's ``parse_probs`` is
      not modified because the masking multiplication creates a new array.

    """
    if not ensure_tree:
        # block and pad heads
        return np.argmax(parse_probs * tokens_to_keep, axis=1)

    identity = np.eye(len(tokens_to_keep))
    # block loops and pad heads
    parse_probs = parse_probs * tokens_to_keep * (1 - identity)
    parse_preds = np.argmax(parse_probs, axis=1)
    tokens = np.arange(1, length)
    roots = np.where(parse_preds[tokens] == 0)[0] + 1
    n_roots = len(roots)
    if n_roots < 1:
        # No root at all: promote the token that loses the least probability
        # when its head is switched to position 0.
        ratios = parse_probs[tokens, 0] / parse_probs[tokens, parse_preds[tokens]]
        parse_preds[tokens[np.argmax(ratios)]] = 0
    elif n_roots > 1:
        # Several roots: keep the one whose best alternative head is relatively
        # the worst, and re-attach all the others to their best non-root head.
        root_probs = parse_probs[roots, 0]
        parse_probs[roots, 0] = 0
        new_heads = np.argmax(parse_probs[roots][:, tokens], axis=1) + 1
        ratios = parse_probs[roots, new_heads] / root_probs
        kept_root = roots[np.argmin(ratios)]
        parse_preds[roots] = new_heads
        parse_preds[kept_root] = 0

    # remove cycles
    graph = Tarjan(parse_preds, tokens)
    for component in graph.SCCs:
        if len(component) <= 1:
            continue
        # Collect the cycle together with everything hanging below it.
        dependents = set()
        frontier = set(component)
        while frontier:
            node = frontier.pop()
            if node not in dependents:
                dependents.add(node)
                frontier.update(graph.edges[node])
        # The indices of the nodes that participate in the cycle
        cycle = np.array(list(component))
        # The current heads and their probabilities
        old_heads = parse_preds[cycle]
        old_head_probs = parse_probs[cycle, old_heads]
        # A cycle node may not pick a head inside its own subtree
        non_heads = np.array(list(dependents))
        parse_probs[cycle[:, None], non_heads[None, :]] = 0
        # Best remaining head for every cycle node
        new_heads = np.argmax(parse_probs[cycle][:, tokens], axis=1) + 1
        gains = parse_probs[cycle, new_heads] / old_head_probs
        # Re-attach the node for which the change costs the least
        pick = np.argmax(gains)
        moved = cycle[pick]
        old_head = old_heads[pick]
        new_head = new_heads[pick]
        parse_preds[moved] = new_head
        graph.edges[new_head].add(moved)
        graph.edges[old_head].remove(moved)
    return parse_preds


def rel_argmax(rel_probs, length, root, ensure_tree=True):
    """Pick a relation for every token and, optionally, force exactly one ``root`` relation.

    Args:
      rel_probs(NDArray): seq_len x rel_size
      length: real sentence length; positions ``1 .. length - 1`` are treated as tokens
      root: the index of the root relation within the ``rel_size`` axis
      ensure_tree: when True, apply the single-root heuristic  (Default value = True)

    Returns:
      The arg-max relation index of every position, with the heuristic applied to
      the token positions. ``rel_probs`` itself is left untouched.

    """
    rel_preds = np.argmax(rel_probs, axis=1)
    if not ensure_tree:
        return rel_preds

    tokens = np.arange(1, length)
    if len(tokens) == 0:
        # Nothing but the leading position: there is no token to repair.
        return rel_preds

    roots = np.where(rel_preds[tokens] == root)[0] + 1
    if len(roots) < 1:
        # No token is labelled root: relabel the one with the highest root probability.
        rel_preds[1 + np.argmax(rel_probs[tokens, root])] = root
    elif len(roots) > 1:
        root_probs = rel_probs[roots, root]
        # Advanced indexing returns a copy, so zeroing the root column here
        # does not write into the caller's array.
        candidates = rel_probs[roots]
        candidates[:, root] = 0
        new_rel_preds = np.argmax(candidates, axis=1)
        new_rel_probs = candidates[np.arange(len(roots)), new_rel_preds] / root_probs
        # Keep as root the token whose best alternative is relatively the worst.
        new_root = roots[np.argmin(new_rel_probs)]
        rel_preds[roots] = new_rel_preds
        rel_preds[new_root] = root
    return rel_preds


================================================
FILE: hanlp/components/parsers/biaffine_tf/layers.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-26 23:05
# Ported from the PyTorch implementation https://github.com/zysite/biaffine-parser
import tensorflow as tf
from hanlp.utils.tf_util import tf_bernoulli


class Biaffine(tf.keras.layers.Layer):
    """Biaffine scorer computing ``x^T W y`` for every pair of positions and every output channel."""

    def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True, trainable=True, name=None, dtype=None, dynamic=False,
                 **kwargs):
        """
        Args:
          n_in: feature size of both inputs.
          n_out: number of output channels (Default value = 1)
          bias_x: append a constant 1 feature to ``x`` (Default value = True)
          bias_y: append a constant 1 feature to ``y`` (Default value = True)
          trainable, name, dtype, dynamic, **kwargs: forwarded to the Keras ``Layer`` constructor.
        """
        super().__init__(trainable, name, dtype, dynamic, **kwargs)
        self.n_in, self.n_out = n_in, n_out
        self.bias_x, self.bias_y = bias_x, bias_y
        # created lazily in build()
        self.weight = None

    def build(self, input_shape):
        # One (n_in [+1]) x (n_in [+1]) matrix per output channel; the booleans
        # add one row/column when the corresponding bias feature is enabled.
        kernel_shape = (self.n_out, self.n_in + self.bias_x, self.n_in + self.bias_y)
        self.weight = self.add_weight(name='kernel', shape=kernel_shape, initializer='zero')

    def extra_repr(self):
        """Return a short textual description of the layer's configuration."""
        parts = [f"n_in={self.n_in}", f"n_out={self.n_out}"]
        if self.bias_x:
            parts.append(f"bias_x={self.bias_x}")
        if self.bias_y:
            parts.append(f"bias_y={self.bias_y}")
        return ", ".join(parts)

    # noinspection PyMethodOverriding
    def call(self, x, y, **kwargs):
        """Score every position of ``x`` against every position of ``y``."""
        if self.bias_x:
            ones_x = tf.ones_like(x[..., :1])
            x = tf.concat((x, ones_x), -1)
        if self.bias_y:
            ones_y = tf.ones_like(y[..., :1])
            y = tf.concat((y, ones_y), -1)
        # [batch_size, n_out, seq_len, seq_len]
        scores = tf.einsum('bxi,oij,byj->boxy', x, self.weight, y)
        # remove dim 1 if n_out == 1
        return tf.squeeze(scores, axis=1) if self.n_out == 1 else scores


class MLP(tf.keras.layers.Layer):
    """A dense layer followed by LeakyReLU(0.1) and :class:`SharedDropout`."""

    def __init__(self, n_hidden, dropout=0, trainable=True, name=None, dtype=None, dynamic=False, **kwargs):
        """
        Args:
          n_hidden: output size of the dense layer.
          dropout: drop probability handed to ``SharedDropout`` (Default value = 0)
          trainable, name, dtype, dynamic, **kwargs: forwarded to the Keras ``Layer`` constructor.
        """
        super().__init__(trainable, name, dtype, dynamic, **kwargs)
        self.linear = tf.keras.layers.Dense(n_hidden, kernel_initializer='orthogonal')
        self.activation = tf.keras.layers.LeakyReLU(0.1)
        self.dropout = SharedDropout(p=dropout)

    def call(self, x, **kwargs):
        """Apply projection, activation and dropout, in that order."""
        projected = self.linear(x)
        activated = self.activation(projected)
        return self.dropout(activated)


class SharedDropout(tf.keras.layers.Layer):
    """Dropout whose mask is sampled once per sequence and reused for every timestep."""

    def __init__(self, p=0.5, batch_first=True, trainable=True, name=None, dtype=None, dynamic=False, **kwargs):
        """Dropout on timesteps with bernoulli distribution

        Args:
          p: drop probability (Default value = 0.5)
          batch_first: whether axis 0 of the input is the batch axis (Default value = True)
          trainable, name, dtype, dynamic, **kwargs: forwarded to the Keras ``Layer`` constructor.
        """
        super().__init__(trainable, name, dtype, dynamic, **kwargs)
        self.p = p
        self.batch_first = batch_first

    def extra_repr(self):
        """Return a short textual description of the layer's configuration."""
        description = f"p={self.p}"
        if not self.batch_first:
            return description
        return description + f", batch_first={self.batch_first}"

    def call(self, x, training=None, **kwargs):
        """Return ``x`` unchanged outside training; otherwise multiply it by the shared mask."""
        if not (training and self.p > 0):
            return x
        if self.batch_first:
            # Sample on the first timestep's shape and broadcast over the time axis.
            mask = tf.expand_dims(self.get_mask(x[:, 0], self.p), axis=1)
        else:
            mask = self.get_mask(x[0], self.p)
        x *= mask
        return x

    @staticmethod
    def get_mask(x, p):
        """Sample a mask shaped like ``x`` with parameter ``1 - p`` and rescale it by ``1 / (1 - p)``."""
        keep = 1 - p
        return tf_bernoulli(tf.shape(x), keep, x.dtype) / keep


class IndependentDropout(tf.keras.layers.Layer):
    """Drop whole positions of several aligned inputs independently and rescale the survivors."""

    def __init__(self, p=0.5, trainable=True, name=None, dtype=None, dynamic=False, **kwargs):
        """Dropout on the first two dimensions

        Args:
          p: drop probability (Default value = 0.5)
          trainable, name, dtype, dynamic, **kwargs: forwarded to the Keras ``Layer`` constructor.
        """
        super().__init__(trainable, name, dtype, dynamic, **kwargs)
        self.p = p

    def extra_repr(self):
        """Return a short textual description of the layer's configuration."""
        return f"p={self.p}"

    def call(self, inputs, training=None, **kwargs):
        """Mask each input separately during training.

        Args:
          inputs: a list of tensors sharing their first two dimensions.
          training: masks are only applied when this is truthy and ``p > 0``.

        Returns:
          The list of (possibly masked and rescaled) inputs.
        """
        if training and self.p > 0:
            # One mask per input over the first two axes.
            # NOTE(review): assumes tf_bernoulli returns 0/1 samples -- confirm in hanlp.utils.tf_util.
            masks = [tf_bernoulli(tf.shape(x)[:2], 1 - self.p)
                     for x in inputs]
            # How many inputs survived at each position.
            total = sum(masks)
            # Rescale by (number of inputs / number of survivors); the element-wise
            # maximum with 1 avoids dividing by zero where everything was dropped.
            # The previous tf.reduce_max(tf.ones_like(total)) always evaluated to 1,
            # so the survivor count was ignored.
            scale = len(inputs) / tf.maximum(total, tf.ones_like(total))
            masks = [mask * scale for mask in masks]
            inputs = [item * tf.expand_dims(mask, axis=-1)
                      for item, mask in zip(inputs, masks)]

        return inputs


================================================
FILE: hanlp/components/parsers/biaffine_tf/model.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-26 23:04
import tensorflow as tf
from hanlp.layers.transformers.tf_imports import TFPreTrainedModel
from hanlp.components.parsers.biaffine_tf.layers import IndependentDropout, SharedDropout, Biaffine, MLP


class BiaffineModelTF(tf.keras.Model):

    def __init__(self, config, embed=None, transformer: TFPreTrainedModel = None):
        """An implementation of T. Dozat and C. D. Manning, “Deep Biaffine Attention for Neural Dependency Parsing.,” ICLR, 2017.
            Although I have my MXNet implementation, I found zysite's PyTorch implementation is cleaner so I port it to TensorFlow

        Args:
          config: hyper-parameters. Read as attributes (``n_words``, ``n_embed``, ``n_feats``,
            ``embed_dropout``, ``n_lstm_layers``, ``n_lstm_hidden``, ``lstm_dropout``, ``n_mlp_arc``,
            ``n_mlp_rel``, ``mlp_dropout``, ``n_rels``, ``pad_index``, ``unk_index``) and through
            ``config.get`` (``transformer_dropout``, ``d_positional``, ``max_seq_length``).
          embed: optional callable applied to the word ids whose output is added to the trainable
            word embedding (Default value = None)
          transformer: optional encoder used instead of the embedding + BiLSTM stack; mutually
            exclusive with ``embed`` (Default value = None)

        """
        super(BiaffineModelTF, self).__init__()
        assert not (embed and transformer), 'Either pre-trained word embed and transformer is supported, but not both'
        normal = tf.keras.initializers.RandomNormal(stddev=1.)
        if not transformer:
            # the embedding layer
            # (zero-initialised when a pre-trained `embed` is supplied, since its output is added on top)
            self.word_embed = tf.keras.layers.Embedding(input_dim=config.n_words,
                                                        output_dim=config.n_embed,
                                                        embeddings_initializer=tf.keras.initializers.zeros() if embed
                                                        else normal,
                                                        name='word_embed')
            self.feat_embed = tf.keras.layers.Embedding(input_dim=config.n_feats,
                                                        output_dim=config.n_embed,
                                                        embeddings_initializer=tf.keras.initializers.zeros() if embed
                                                        else normal,
                                                        name='feat_embed')
            self.embed_dropout = IndependentDropout(p=config.embed_dropout, name='embed_dropout')

            # the word-lstm layer
            self.lstm = tf.keras.models.Sequential(name='lstm')
            for _ in range(config.n_lstm_layers):
                self.lstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
                    units=config.n_lstm_hidden,
                    dropout=config.lstm_dropout,
                    recurrent_dropout=config.lstm_dropout,
                    return_sequences=True,
                    kernel_initializer='orthogonal',
                    unit_forget_bias=False,  # turns out to hinder performance
                )))
            self.lstm_dropout = SharedDropout(p=config.lstm_dropout, name='lstm_dropout')
        else:
            self.transformer = transformer
            # the attributes below only exist when configured; call()/run_transformer() test for them with hasattr
            transformer_dropout = config.get('transformer_dropout', None)
            if transformer_dropout:
                self.transformer_dropout = SharedDropout(p=config.transformer_dropout, name='transformer_dropout')
            d_positional = config.get('d_positional', None)
            if d_positional:
                max_seq_length = config.get('max_seq_length', 256)
                # learned position table, one row of size d_positional per position
                self.position_table = self.add_weight(shape=(max_seq_length, d_positional),
                                                      initializer='random_normal',
                                                      trainable=True)
        # the MLP layers
        self.mlp_arc_h = MLP(n_hidden=config.n_mlp_arc,
                             dropout=config.mlp_dropout, name='mlp_arc_h')
        self.mlp_arc_d = MLP(n_hidden=config.n_mlp_arc,
                             dropout=config.mlp_dropout, name='mlp_arc_d')
        self.mlp_rel_h = MLP(n_hidden=config.n_mlp_rel,
                             dropout=config.mlp_dropout, name='mlp_rel_h')
        self.mlp_rel_d = MLP(n_hidden=config.n_mlp_rel,
                             dropout=config.mlp_dropout, name='mlp_rel_d')

        # the Biaffine layers
        self.arc_attn = Biaffine(n_in=config.n_mlp_arc,
                                 bias_x=True,
                                 bias_y=False, name='arc_attn')
        self.rel_attn = Biaffine(n_in=config.n_mlp_rel,
                                 n_out=config.n_rels,
                                 bias_x=True,
                                 bias_y=True, name='rel_attn')
        if embed is not None:
            self.pretrained = embed
        self.pad_index = tf.constant(config.pad_index, dtype=tf.int64)
        self.unk_index = tf.constant(config.unk_index, dtype=tf.int64)

    # noinspection PyMethodOverriding
    def call(self, inputs, mask_inf=True, **kwargs):
        """Compute arc and relation scores for a batch.

        Args:
          inputs: ``(words, feats)`` when the model was built with the LSTM encoder, otherwise
            ``(words, (input_ids, input_mask, prefix_offset))``.
          mask_inf: when True, arc scores at padded positions are replaced with ``-inf``
            (Default value = True)

        Returns:
          ``(s_arc, s_rel)``: arc scores and relation scores.

        """
        # the mask marks every position whose word id differs from pad_index
        if hasattr(self, 'lstm'):
            words, feats = inputs
            mask = tf.not_equal(words, self.pad_index)
            # set the indices larger than num_embeddings to unk_index
            ext_mask = tf.greater_equal(words, self.word_embed.input_dim)
            ext_words = tf.where(ext_mask, self.unk_index, words)

            # get outputs from embedding layers
            word_embed = self.word_embed(ext_words)
            if hasattr(self, 'pretrained'):
                # the pre-trained lookup receives the original (un-clipped) ids
                word_embed += self.pretrained(words)
            feat_embed = self.feat_embed(feats)
            word_embed, feat_embed = self.embed_dropout([word_embed, feat_embed])
            # concatenate the word and feat representations
            embed = tf.concat((word_embed, feat_embed), axis=-1)

            x = self.lstm(embed, mask=mask)
            x = self.lstm_dropout(x)
        else:
            words, (input_ids, input_mask, prefix_offset) = inputs
            mask = tf.not_equal(words, self.pad_index)
            x = self.run_transformer(input_ids, input_mask, prefix_offset)

        # apply MLPs to the encoder output states
        arc_h = self.mlp_arc_h(x)
        arc_d = self.mlp_arc_d(x)
        rel_h = self.mlp_rel_h(x)
        rel_d = self.mlp_rel_d(x)

        # get arc and rel scores from the bilinear attention
        # [batch_size, seq_len, seq_len]
        s_arc = self.arc_attn(arc_d, arc_h)
        # [batch_size, seq_len, seq_len, n_rels]
        s_rel = tf.transpose(self.rel_attn(rel_d, rel_h), [0, 2, 3, 1])
        # set the scores that exceed the length of each sentence to -inf
        if mask_inf:
            s_arc = tf.where(tf.expand_dims(mask, 1), s_arc, float('-inf'))

        return s_arc, s_rel

    def run_transformer(self, input_ids, input_mask, prefix_offset):
        """Encode sub-word ids with the transformer and gather one vector per word.

        Args:
          input_ids: sub-word ids fed to the transformer.
          input_mask: mask passed along with ``input_ids``.
          prefix_offset: per-sentence indices into the transformer output that are gathered
            (presumably the position of each word's first sub-word -- confirm against the data pipeline).

        Returns:
          The gathered states, after optional dropout and with the optional position table concatenated.

        """
        if isinstance(self.transformer, TFPreTrainedModel):
            sequence_output = self.transformer([input_ids, input_mask])
            # keep only the first element of the returned outputs
            sequence_output = sequence_output[0]
        else:
            # other encoders are called with all-zero second input and an explicit mask keyword
            sequence_output = self.transformer([input_ids, tf.zeros_like(input_ids)], mask=input_mask)
        x = tf.gather(sequence_output, prefix_offset, batch_dims=1)
        if hasattr(self, 'transformer_dropout'):
            x = self.transformer_dropout(x)
        if hasattr(self, 'position_table'):
            batch_size, seq_length = tf.shape(x)[:2]
            # repeat the first seq_length rows of the table for every sentence in the batch
            timing_signal = tf.broadcast_to(self.position_table[:seq_length],
                                            [batch_size, seq_length, self.position_table.shape[-1]])
            x = tf.concat([x, timing_signal], axis=-1)
        return x

    def to_functional(self):
        """Wrap the LSTM variant in a functional Keras model with ``words``/``feats`` inputs and unmasked scores."""
        words = tf.keras.Input(shape=[None], dtype=tf.int64, name='words')
        feats = tf.keras.Input(shape=[None], dtype=tf.int64, name='feats')
        s_arc, s_rel = self.call([words, feats], mask_inf=False)
        return tf.keras.Model(inputs=[words, feats], outputs=[s_arc, s_rel])


================================================
FILE: hanlp/components/parsers/chu_liu_edmonds.py
================================================
# Adopted from https://github.com/allenai/allennlp under Apache Licence 2.0.
# Changed the packaging.

from typing import List, Set, Tuple, Dict
import numpy


def decode_mst(
        energy: numpy.ndarray, length: int, has_labels: bool = True
) -> Tuple[numpy.ndarray, numpy.ndarray]:
    """Decode the optimal MST tree with the Chu-Liu-Edmonds algorithm for
    maximum spanning arborescences on graphs.

    Note: Counter to typical intuition, this function decodes the _maximum_
    spanning tree.

    Adopted from https://github.com/allenai/allennlp/blob/master/allennlp/nn/chu_liu_edmonds.py
    which is licensed under the Apache License 2.0

    Args:
      energy: A tensor with shape (num_labels, timesteps, timesteps)
        containing the energy of each edge. If has_labels is `False`,
        the tensor should have shape (timesteps, timesteps) instead.
        The array is read only; all work happens on a private copy.
      length: The length of this sequence, as the energy may have come
        from a padded batch.
      has_labels: Whether the graph has labels or not.  (Default value = True)

    Returns:
      A pair ``(heads, head_type)`` of int32 arrays of size ``timesteps``:
      ``heads[child]`` is the selected parent (``-1`` for node 0) and
      ``head_type[child]`` the best label of that edge. ``head_type`` is
      ``None`` when ``has_labels`` is False.

    Raises:
      ValueError: if the number of dimensions of ``energy`` does not match ``has_labels``.

    """
    if has_labels and energy.ndim != 3:
        raise ValueError("The dimension of the energy array is not equal to 3.")
    elif not has_labels and energy.ndim != 2:
        raise ValueError("The dimension of the energy array is not equal to 2.")
    max_length = energy.shape[-1]

    # Our energy matrix might have been batched -
    # here we clip it to contain only non padded tokens.
    if has_labels:
        energy = energy[:, :length, :length]
        # get best label for each edge.
        label_id_matrix = energy.argmax(axis=0)
        energy = energy.max(axis=0)
    else:
        energy = energy[:length, :length]
        label_id_matrix = None

    # Work on a private copy: in the unlabelled case `energy` is still a view
    # of the caller's array, so writing to it would leak out of this function.
    score_matrix = numpy.array(energy, copy=True)

    old_input = numpy.zeros([length, length], dtype=numpy.int32)
    old_output = numpy.zeros([length, length], dtype=numpy.int32)
    current_nodes = [True for _ in range(length)]
    representatives: List[Set[int]] = []

    for node1 in range(length):
        # self loops are never valid edges
        score_matrix[node1, node1] = 0.0
        representatives.append({node1})

        # initially every edge (a, b) maps back to itself
        for node2 in range(node1 + 1, length):
            old_input[node1, node2] = node1
            old_output[node1, node2] = node2

            old_input[node2, node1] = node2
            old_output[node2, node1] = node1

    final_edges: Dict[int, int] = {}

    # The main algorithm operates inplace.
    chu_liu_edmonds(
        length, score_matrix, current_nodes, final_edges, old_input, old_output, representatives
    )

    heads = numpy.zeros([max_length], numpy.int32)
    if has_labels:
        head_type = numpy.ones([max_length], numpy.int32)
    else:
        head_type = None

    for child, parent in final_edges.items():
        heads[child] = parent
        if has_labels:
            head_type[child] = label_id_matrix[parent, child]

    return heads, head_type


def chu_liu_edmonds(
        length: int,
        score_matrix: numpy.ndarray,
        current_nodes: List[bool],
        final_edges: Dict[int, int],
        old_input: numpy.ndarray,
        old_output: numpy.ndarray,
        representatives: List[Set[int]],
):
    """Applies the chu-liu-edmonds algorithm recursively
    to a graph with edge weights defined by score_matrix.

    Note that this function operates in place, so variables
    will be modified.

    Args:
      length: The number of nodes.
      score_matrix: The score matrix representing the scores for pairs
        of nodes; ``score_matrix[parent, child]`` is the score of that edge.
      current_nodes: The nodes which are representatives in the graph.
        A representative at its most basic represents a node,
        but as the algorithm progresses, individual nodes will
        represent collapsed cycles in the graph.
      final_edges: An empty dictionary which will be populated with the
        nodes which are connected in the maximum spanning tree,
        as ``child -> parent`` (node 0 is mapped to -1).
      old_input: For every (possibly contracted) edge, the parent end of the
        original edge it stands for.
      old_output: For every (possibly contracted) edge, the child end of the
        original edge it stands for.
      representatives: A list containing the nodes that a particular node
        is representing at this iteration in the graph.

    Returns:
      Nothing - all variables are modified in place.

    """
    # Set the initial graph to be the greedy best one.
    # Node 0 has no parent; every other node starts attached to node 0.
    parents = [-1]
    for node1 in range(1, length):
        parents.append(0)
        if current_nodes[node1]:
            max_score = score_matrix[0, node1]
            for node2 in range(1, length):
                if node2 == node1 or not current_nodes[node2]:
                    continue

                new_score = score_matrix[node2, node1]
                if new_score > max_score:
                    max_score = new_score
                    parents[node1] = node2

    # Check if this solution has a cycle.
    has_cycle, cycle = _find_cycle(parents, length, current_nodes)
    # If there are no cycles, find all edges and return.
    if not has_cycle:
        final_edges[0] = -1
        for node in range(1, length):
            if not current_nodes[node]:
                continue

            # translate the (possibly contracted) edge back to the original one
            parent = old_input[parents[node], node]
            child = old_output[parents[node], node]
            final_edges[child] = parent
        return

    # Otherwise, we have a cycle so we need to remove an edge.
    # From here until the recursive call is the contraction stage of the algorithm.
    cycle_weight = 0.0
    # Find the weight of the cycle.
    # (`index` only counts the cycle nodes; its value is not read afterwards.)
    index = 0
    for node in cycle:
        index += 1
        cycle_weight += score_matrix[parents[node], node]

    # For each node in the graph, find the maximum weight incoming
    # and outgoing edge into the cycle.
    cycle_representative = cycle[0]
    for node in range(length):
        if not current_nodes[node] or node in cycle:
            continue

        in_edge_weight = float("-inf")
        in_edge = -1
        out_edge_weight = float("-inf")
        out_edge = -1

        for node_in_cycle in cycle:
            if score_matrix[node_in_cycle, node] > in_edge_weight:
                in_edge_weight = score_matrix[node_in_cycle, node]
                in_edge = node_in_cycle

            # Add the new edge score to the cycle weight
            # and subtract the edge we're considering removing.
            score = (
                    cycle_weight
                    + score_matrix[node, node_in_cycle]
                    - score_matrix[parents[node_in_cycle], node_in_cycle]
            )

            if score > out_edge_weight:
                out_edge_weight = score
                out_edge = node_in_cycle

        # Redirect the best edges between `node` and the cycle to the cycle's
        # representative, remembering which original edges they stand for.
        score_matrix[cycle_representative, node] = in_edge_weight
        old_input[cycle_representative, node] = old_input[in_edge, node]
        old_output[cycle_representative, node] = old_output[in_edge, node]

        score_matrix[node, cycle_representative] = out_edge_weight
        old_output[node, cycle_representative] = old_output[node, out_edge]
        old_input[node, cycle_representative] = old_input[node, out_edge]

    # For the next recursive iteration, we want to consider the cycle as a
    # single node. Here we collapse the cycle into the first node in the
    # cycle (first node is arbitrary), set all the other nodes not be
    # considered in the next iteration. We also keep track of which
    # representatives we are considering this iteration because we need
    # them below to check if we're done.
    considered_representatives: List[Set[int]] = []
    for i, node_in_cycle in enumerate(cycle):
        considered_representatives.append(set())
        if i > 0:
            # We need to consider at least one
            # node in the cycle, arbitrarily choose
            # the first.
            current_nodes[node_in_cycle] = False

        for node in representatives[node_in_cycle]:
            considered_representatives[i].add(node)
            if i > 0:
                representatives[cycle_representative].add(node)

    chu_liu_edmonds(
        length, score_matrix, current_nodes, final_edges, old_input, old_output, representatives
    )

    # Expansion stage.
    # check each node in cycle, if one of its representatives
    # is a key in the final_edges, it is the one we need.
    found = False
    key_node = -1
    for i, node in enumerate(cycle):
        for cycle_rep in considered_representatives[i]:
            if cycle_rep in final_edges:
                key_node = node
                found = True
                break
        if found:
            break

    # Walk the cycle backwards from the key node and record every cycle edge
    # except the one entering the key node (its parent came from the recursion).
    previous = parents[key_node]
    while previous != key_node:
        child = old_output[parents[previous], previous]
        parent = old_input[parents[previous], previous]
        final_edges[child] = parent
        previous = parents[previous]


def _find_cycle(
        parents: List[int], length: int, current_nodes: List[bool]
) -> Tuple[bool, List[int]]:
    added = [False for _ in range(length)]
    added[0] = True
    cycle = set()
    has_cycle = False
    for i in range(1, length):
        if has_cycle:
            break
        # don't redo nodes we've already
        # visited or aren't considering.
        if added[i] or not current_nodes[i]:
            continue
        # Initialize a new possible cycle.
        this_cycle = set()
        this_cycle.add(i)
        added[i] = True
        has_cycle = True
        next_node = i
        while parents[next_node] not in this_cycle:
            next_node = parents[next_node]
            # If we see a node we've already processed,
            # we can stop, because the node we are
            # processing would have been in that cycle.
            if added[next_node]:
                has_cycle = False
                break
            added[next_node] = True
            this_cycle.add(next_node)

        if has_cycle:
            original = next_node
            cycle.add(original)
            next_node = parents[original]
            while next_node != original:
                cycle.add(next_node)
                next_node = parents[next_node]
            break

    return has_cycle, list(cycle)


================================================
FILE: hanlp/components/parsers/conll.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-26 15:37
from typing import Union

from hanlp.utils.io_util import get_resource, TimingFileIterator
from hanlp.utils.log_util import logger


def collapse_enhanced_empty_nodes(sent: list):
    """Remove empty nodes (rows with a ``float`` ID) from a sentence and re-route arcs through them.

    For every empty node, each ``head:rel`` entry in column 8 of any row that
    points at the empty node is rewritten to point at the empty node's own
    head, with the two relations chained by ``>``. The rows are modified in
    place.

    Args:
      sent: list of CoNLL-U rows; column 0 holds the ID (``int`` for words, ``float``
        for empty nodes), column 7 the relation and column 8 the ``|``-separated
        ``head:rel`` entries.

    Returns:
      A new list holding the non-empty rows, in their original order.

    """
    collapsed = []
    for cells in sent:
        if not isinstance(cells[0], float):
            collapsed.append(cells)
            continue

        node_id = str(cells[0])
        head, deprel = cells[8].split(':', 1)
        for row in sent:
            arrows = [s.split(':', 1) for s in row[8].split('|')]
            # NOTE(review): the rewritten entry is rendered as 'head:head:deprel>rel'
            # because the head is part of both tuple members -- confirm this is intended.
            arrows = sorted((head, f'{head}:{deprel}>{r}') if h == node_id else (h, r) for h, r in arrows)
            row[8] = '|'.join(f'{h}:{r}' for h, r in arrows)

        # `head` is a string ID, not a list position, so the head row has to be
        # looked up by ID (indexing the list with it raised TypeError). Heads
        # without a row of their own, such as '0', are skipped.
        head_row = next((row for row in sent if str(row[0]) == head), None)
        if head_row is not None:
            head_row[7] += f'>{cells[7]}'
    return collapsed


def read_conll(filepath: Union[str, TimingFileIterator], underline_to_none=False, enhanced_collapse_empty_nodes=False):
    """Lazily read sentences from a CoNLL-style, tab-separated file.

    Lines starting with ``#`` are skipped. Every other non-blank line becomes a list of cells whose ID
    (cell 0) is converted to ``int``, and whose HEAD (cell 6) is converted to ``int`` unless it is ``_``.
    A HEAD that cannot be parsed is replaced by ``0`` and the problem is logged. Rows whose ID contains
    ``-`` or ``.`` are dropped, except that rows with ``.`` in the ID are kept (ID as ``float``, HEAD as
    ``None``) when ``enhanced_collapse_empty_nodes`` is enabled.

    A blank line terminates the current sentence. Every blank line yields, so consecutive blank lines
    produce empty lists.

    Args:
        filepath: A path or identifier resolved through ``get_resource``, or an already opened iterable of
            lines. Iterables are closed when reading ends if they expose ``close``.
        underline_to_none: If ``True``, every cell equal to ``_`` is replaced by ``None``.
        enhanced_collapse_empty_nodes: If truthy, each sentence is passed through
            :func:`collapse_enhanced_empty_nodes` before being yielded. Passing ``None`` explicitly enables
            it automatically for paths ending in ``.conllu``; the default ``False`` never does.

    Yields:
        One sentence at a time, as a list of rows (each row a list of cells).
    """
    sent = []
    if isinstance(filepath, str):
        filepath: str = get_resource(filepath)
        if filepath.endswith('.conllu') and enhanced_collapse_empty_nodes is None:
            enhanced_collapse_empty_nodes = True
        src = open(filepath, encoding='utf-8')
    else:
        src = filepath
    # The source is released in ``finally`` so that an abandoned generator (``close()``, garbage
    # collection, ``break`` in the consumer) or an exception while parsing does not leak the file handle.
    try:
        for idx, line in enumerate(src):
            if line.startswith('#'):
                continue
            line = line.strip()
            cells = line.split('\t')
            if line and cells:
                if enhanced_collapse_empty_nodes and '.' in cells[0]:
                    # Empty node: keep it for the later collapse step, marked by a float ID.
                    cells[0] = float(cells[0])
                    cells[6] = None
                else:
                    if '-' in cells[0] or '.' in cells[0]:
                        # Multiword-token ranges and empty nodes are not part of the basic tree.
                        continue
                    cells[0] = int(cells[0])
                    if cells[6] != '_':
                        try:
                            cells[6] = int(cells[6])
                        except ValueError:
                            cells[6] = 0
                            logger.exception(f'Wrong CoNLL format {filepath}:{idx + 1}\n{line}')
                if underline_to_none:
                    for i, x in enumerate(cells):
                        if x == '_':
                            cells[i] = None
                sent.append(cells)
            else:
                if enhanced_collapse_empty_nodes:
                    sent = collapse_enhanced_empty_nodes(sent)
                yield sent
                sent = []

        # The last sentence may not be followed by a blank line.
        if sent:
            if enhanced_collapse_empty_nodes:
                sent = collapse_enhanced_empty_nodes(sent)
            yield sent
    finally:
        close = getattr(src, 'close', None)
        if close is not None:
            close()


================================================
FILE: hanlp/components/parsers/constituency/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-28 19:26


================================================
FILE: hanlp/components/parsers/constituency/crf_constituency_model.py
================================================
# -*- coding:utf-8 -*-
# Adopted from https://github.com/yzhangcs/parser
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import torch
from torch import nn
from hanlp.components.parsers.constituency.treecrf import CRFConstituency
from hanlp.components.parsers.alg import cky
from hanlp.components.parsers.biaffine.biaffine import Biaffine
from hanlp.components.parsers.biaffine.mlp import MLP


class CRFConstituencyDecoder(nn.Module):
    r"""
    Decoder of the CRF Constituency Parser,
    also called FANCY (abbr. of Fast and Accurate Neural Crf constituencY) Parser.

    Encoder states are projected by four MLPs and scored with two biaffine layers, one for spans and one
    for span labels. The span loss is delegated to :class:`CRFConstituency` and the label loss is a
    cross entropy over gold spans.

    References:
        - Yu Zhang, Houquan Zhou and Zhenghua Li. 2020.
          `Fast and Accurate Neural CRF Constituency Parsing`_.

    Args:
        n_labels (int):
            The number of labels.
        n_hidden (int):
            The input size of the MLPs, i.e. the last dimension of the encoder output. Default: 400.
        n_mlp_span (int):
            Span MLP size. Default: 500.
        n_mlp_label  (int):
            Label MLP size. Default: 100.
        mlp_dropout (float):
            The dropout ratio of MLP layers. Default: .33.
        **kwargs:
            Accepted for convenience and ignored.

    .. _Fast and Accurate Neural CRF Constituency Parsing:
        https://www.ijcai.org/Proceedings/2020/560/
    """

    def __init__(self, n_labels, n_hidden=400, n_mlp_span=500, n_mlp_label=100, mlp_dropout=.33, **kwargs):
        super().__init__()

        # Boundary projections. Sub-modules are created in a fixed order (span left/right, then label
        # left/right) so that parameters are registered and initialised in the same sequence.
        span_mlp_args = dict(n_in=n_hidden, n_out=n_mlp_span, dropout=mlp_dropout)
        label_mlp_args = dict(n_in=n_hidden, n_out=n_mlp_label, dropout=mlp_dropout)
        self.mlp_span_l = MLP(**span_mlp_args)
        self.mlp_span_r = MLP(**span_mlp_args)
        self.mlp_label_l = MLP(**label_mlp_args)
        self.mlp_label_r = MLP(**label_mlp_args)

        # Biaffine scorers for spans and for the labels of each span.
        self.span_attn = Biaffine(n_in=n_mlp_span, bias_x=True, bias_y=False)
        self.label_attn = Biaffine(n_in=n_mlp_label, n_out=n_labels, bias_x=True, bias_y=True)
        self.crf = CRFConstituency()
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x, **kwargs):
        r"""
        Args:
            x (~torch.FloatTensor): ``[batch_size, seq_len, hidden_dim]``.
                Hidden states from encoder. The last dimension is split into two halves.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                Span scores and label scores. The label scores are permuted so that the label dimension
                comes last. Both are computed over ``seq_len - 1`` boundary positions.
        """

        first_half, second_half = torch.chunk(x, 2, dim=-1)
        # Pair the first half at position i with the second half at position i + 1.
        boundaries = torch.cat((first_half[:, :-1], second_half[:, 1:]), dim=-1)

        # All four projections are computed before any scoring, in this order.
        left_span = self.mlp_span_l(boundaries)
        right_span = self.mlp_span_r(boundaries)
        left_label = self.mlp_label_l(boundaries)
        right_label = self.mlp_label_r(boundaries)

        s_span = self.span_attn(left_span, right_span)
        # Move the label dimension from axis 1 to the last axis.
        s_label = self.label_attn(left_label, right_label).permute(0, 2, 3, 1)
        return s_span, s_label

    def loss(self, s_span, s_label, charts, mask, mbr=True):
        r"""
        Args:
            s_span (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
                Scores of all spans.
            s_label (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
                Scores of all labels on each span.
            charts (~torch.LongTensor): ``[batch_size, seq_len, seq_len]``.
                The tensor of gold-standard labels, in which positions without labels are filled with -1.
            mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
                The mask for covering the unpadded tokens in each chart.
            mbr (bool):
                If ``True``, returns marginals for MBR decoding. Default: ``True``.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                The training loss (span loss plus label loss) and the second output of the CRF:
                marginals if ``mbr=True``, original span scores otherwise.
        """

        # Gold spans are the unmasked cells carrying a non-negative label id.
        gold_spans = charts.ge(0) & mask
        span_loss, span_probs = self.crf(s_span, mask, gold_spans, mbr)
        label_loss = self.criterion(s_label[gold_spans], charts[gold_spans])
        return span_loss + label_loss, span_probs

    def decode(self, s_span, s_label, mask):
        r"""
        Args:
            s_span (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
                Scores of all spans.
            s_label (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
                Scores of all labels on each span.
            mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
                The mask for covering the unpadded tokens in each chart.

        Returns:
            list[list[tuple]]:
                For every sentence, the ``(i, j, label_id)`` triples of the spans returned by ``cky``,
                each paired with its highest-scoring label.
        """

        best_spans = cky(s_span, mask)
        best_labels = s_label.argmax(-1).tolist()
        decoded = []
        for spans, labels in zip(best_spans, best_labels):
            decoded.append([(i, j, labels[i][j]) for i, j in spans])
        return decoded


class CRFConstituencyModel(nn.Module):
    """Chain an encoder with a :class:`CRFConstituencyDecoder`."""

    def __init__(self, encoder, decoder: CRFConstituencyDecoder) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, batch):
        r"""
        Args:
            batch (~dict):
                Batch of input data, handed to the encoder unchanged.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                The output of the decoder applied to the encoder states: the scores of all possible
                spans, and the scores of all possible labels on each span.
        """
        return self.decoder(self.encoder(batch))


================================================
FILE: hanlp/components/parsers/constituency/crf_constituency_parser.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-28 21:24
import logging
from typing import Union, List

import torch
from phrasetree.tree import Tree
from torch.utils.data import DataLoader

from hanlp_common.constant import BOS, EOS, IDX
from hanlp.common.dataset import TransformableDataset, SamplerBuilder, PadSequenceDataLoader
from hanlp.common.structure import History
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import FieldLength, TransformList
from hanlp.common.vocab import VocabWithNone
from hanlp.components.classifiers.transformer_classifier import TransformerComponent
from hanlp.datasets.parsing.loaders.constituency_dataset import ConstituencyDataset, unpack_tree_to_features, \
    build_tree, factorize, remove_subcategory
from hanlp.components.parsers.constituency.crf_constituency_model import CRFConstituencyDecoder, CRFConstituencyModel
from hanlp.metrics.parsing.span import SpanMetric
from hanlp.utils.time_util import CountdownTimer
from hanlp.utils.torch_util import clip_grad_norm
from hanlp_common.util import merge_locals_kwargs, merge_dict, reorder


class CRFConstituencyParser(TorchComponent):
    def __init__(self, **kwargs) -> None:
        """Two-stage CRF Parsing (:cite:`ijcai2020-560`).

        Args:
            **kwargs: Predefined config.
        """
        super().__init__(**kwargs)
        # Self-assignment whose only effect is to attach a concrete type annotation to ``self.model``.
        self.model: CRFConstituencyModel = self.model

    def build_optimizer(self, trn, **kwargs):
        """Build the optimizer by reusing ``TransformerComponent.build_optimizer``.

        The method is called as a plain function with this parser passed as ``self`` rather than through
        inheritance.

        Args:
            trn: Training data, forwarded unchanged.
            **kwargs: Forwarded unchanged.

        Returns:
            Whatever ``TransformerComponent.build_optimizer`` returns; ``fit_dataloader`` unpacks it as an
            ``(optimizer, scheduler)`` pair.
        """
        # noinspection PyCallByClass,PyTypeChecker
        return TransformerComponent.build_optimizer(self, trn, **kwargs)

    def build_criterion(self, decoder=None, **kwargs):
        """Return the criterion, which for this parser is the decoder passed in.

        Args:
            decoder: The object to use as criterion. Default: ``None``.
            **kwargs: Ignored.

        Returns:
            ``decoder`` unchanged (``None`` when it is not supplied).
        """
        return decoder

    def build_metric(self, **kwargs):
        """Create a fresh :class:`SpanMetric`.

        Args:
            **kwargs: Ignored.

        Returns:
            A new ``SpanMetric`` instance.
        """
        return SpanMetric()

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, patience=0.5, eval_trn=True, **kwargs):
        """Train for up to ``epochs`` epochs, keeping the weights of the best epoch on ``dev``.

        Args:
            trn: Training dataloader.
            dev: Development dataloader evaluated after every epoch.
            epochs: Maximum number of epochs.
            criterion: Passed through to ``fit_dataloader`` and ``evaluate_dataloader``.
            optimizer: Passed through to ``fit_dataloader``.
            metric: Training metric passed through to ``fit_dataloader``.
            save_dir: Directory used by ``save_weights`` / ``load_weights``.
            logger: Logger receiving per-epoch reports.
            devices: Unused in this method.
            ratio_width: Forwarded to the dataloader loops.
            patience: Number of epochs without improvement tolerated before stopping. A ``float`` is
                interpreted as a fraction of ``epochs`` and truncated to ``int``.
            eval_trn: Forwarded to ``fit_dataloader``.
            **kwargs: Ignored.
        """
        if isinstance(patience, float):
            patience = int(patience * epochs)
        best_epoch, best_metric = 0, -1
        timer = CountdownTimer(epochs)
        history = History()
        for epoch in range(1, epochs + 1):
            logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
            # NOTE(review): explicit keywords are combined with ``**self.config``; if the config also holds
            # one of these keys (e.g. ``eval_trn``) Python raises a duplicate-keyword TypeError - confirm.
            self.fit_dataloader(trn, criterion, optimizer, metric, logger, history=history, ratio_width=ratio_width,
                                eval_trn=eval_trn, **self.config)
            loss, dev_metric = self.evaluate_dataloader(dev, criterion, logger=logger, ratio_width=ratio_width)
            timer.update()
            report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
            improved = dev_metric > best_metric
            if improved:
                best_epoch, best_metric = epoch, dev_metric
                self.save_weights(save_dir)
                report += ' [red](saved)[/red]'
            # Evaluated after a possible update of ``best_epoch``, so it is 0 on an improving epoch.
            epochs_since_best = epoch - best_epoch
            out_of_patience = epochs_since_best >= patience
            if not improved:
                report += f' ({epochs_since_best})'
                if out_of_patience:
                    report += ' early stop'
            logger.info(report)
            if out_of_patience:
                break
        if not best_epoch:
            # No epoch ever improved on the initial score: keep the current weights.
            self.save_weights(save_dir)
        elif best_epoch != epoch:
            # Training ended on a worse epoch: restore the best checkpoint.
            self.load_weights(save_dir)
        logger.info(f"Max score of dev is {best_metric} at epoch {best_epoch}")
        logger.info(f"Average time of each epoch is {timer.elapsed_average_human}")
        logger.info(f"{timer.elapsed_human} elapsed")

    # noinspection PyMethodOverriding
    def fit_dataloader(self,
                       trn: DataLoader,
                       criterion,
                       optimizer,
                       metric: SpanMetric,
                       logger: logging.Logger,
                       history: History,
                       gradient_accumulation=1,
                       grad_norm=None,
                       ratio_width=None,
                       eval_trn=True,
                       **kwargs):
        """Run one training epoch over ``trn``.

        Args:
            trn: Training dataloader.
            criterion: Unused here; the loss comes from :meth:`compute_loss`.
            optimizer: An ``(optimizer, scheduler)`` pair.
            metric: Metric that is reset and, when ``eval_trn`` is set, updated with training predictions.
            logger: Logger used for progress reports.
            history: Step bookkeeping; decides when an optimizer step is due.
            gradient_accumulation: The loss is divided by this value when it is greater than 1.
            grad_norm: Forwarded to :meth:`_step` for gradient clipping.
            ratio_width: Forwarded to the progress timer.
            eval_trn: Whether to decode and score training batches.
            **kwargs: Ignored.
        """
        optimizer, scheduler = optimizer
        metric.reset()
        self.model.train()
        num_steps = history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation)
        timer = CountdownTimer(num_steps)
        running_loss = 0
        for seen, batch in enumerate(trn, start=1):
            out, mask = self.feed_batch(batch)
            gold_charts = batch['chart_id']
            loss, span_probs = self.compute_loss(out, gold_charts, mask)
            if gradient_accumulation and gradient_accumulation > 1:
                loss /= gradient_accumulation
            loss.backward()
            running_loss += loss.item()
            if eval_trn:
                trees = self.decode_output(out, mask, batch, span_probs)
                self.update_metrics(metric, batch, trees)
            if history.step(gradient_accumulation):
                self._step(optimizer, scheduler, grad_norm)
                # The reported loss is the running mean over the batches seen so far.
                report = f'loss: {running_loss / seen:.4f}'
                if eval_trn:
                    report += f' {metric}'
                timer.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
            # Drop the references to this batch's tensors before the next iteration.
            del loss, out, mask

    def decode_output(self, out, mask, batch, span_probs=None, decoder=None, tokens=None):
        """Turn model scores into predicted trees.

        Args:
            out: The ``(s_span, s_label)`` pair produced by the model.
            mask: Boolean chart mask; when it has no ``True`` entry, decoding is skipped and every
                sentence gets an empty chart.
            batch: The batch dict. ``token_`` (or ``token`` as a fallback) is read when ``tokens`` is not
                given, and the first and last token of every sentence are stripped.
            span_probs: Span scores to decode with, if already available. When ``None`` and
                ``self.config.mbr`` is set, they are computed with the CRF of the decoder.
            decoder: Decoder to use. Default: ``self.model.decoder``.
            tokens: Tokens of each sentence, used as given (nothing is stripped).

        Returns:
            A list with one tree per sentence, built by ``build_tree``.
        """
        s_span, s_label = out
        if not decoder:
            decoder = self.model.decoder
        # Tokens are resolved before branching because the empty-mask branch needs their count;
        # resolving them afterwards made that branch call ``len(None)`` whenever ``tokens`` was omitted.
        if tokens is None:
            tokens = batch.get('token_', None)  # Use the original tokens if any
            if tokens is None:
                tokens = batch['token']
            tokens = [x[1:-1] for x in tokens]
        if mask.any().item():
            if span_probs is None:
                if self.config.mbr:
                    s_span = decoder.crf(s_span, mask, mbr=True)
            else:
                s_span = span_probs
            chart_preds = decoder.decode(s_span, s_label, mask)
        else:
            # Nothing to decode: one independent empty chart per sentence.
            chart_preds = [[] for _ in tokens]
        idx_to_token = self.vocabs.chart.idx_to_token
        trees = [build_tree(token, [(i, j, idx_to_token[label]) for i, j, label in chart]) for token, chart in
                 zip(tokens, chart_preds)]
        return trees

    def update_metrics(self, metric, batch, prediction):
        """Score predicted trees against ``batch['constituency']``.

        The pre-terminal nodes of every predicted tree (subtrees of height 2, expected to carry the
        placeholder label ``_``) are relabelled in place with the gold POS tags, so that the deletion
        rules applied by ``factorize`` treat predictions and gold trees alike.

        Args:
            metric: The metric to update; it is called with the factorized predictions and gold trees.
            batch: The batch dict providing the gold trees under ``constituency``.
            prediction: Predicted trees; their pre-terminal labels are modified in place.

        Returns:
            The same ``metric`` object.
        """
        gold_trees = batch['constituency']
        for pred_tree, gold_tree in zip(prediction, gold_trees):
            preterminals = pred_tree.subtrees(lambda t: t.height() == 2)
            for node, (_, pos) in zip(preterminals, gold_tree.pos()):
                assert node.label() == '_'
                node.set_label(pos)

        def to_spans(tree):
            return factorize(tree, self.config.delete, self.config.equal)

        metric([to_spans(tree) for tree in prediction],
               [to_spans(tree) for tree in gold_trees])
        return metric

    def feed_batch(self, batch: dict):
        """Run the model on a batch and build the matching chart mask.

        Args:
            batch: A batch dict; it is passed to the model as-is and to :meth:`compute_mask`.

        Returns:
            ``((s_span, s_label), mask)``: the two outputs of the model and the mask from
            :meth:`compute_mask`.
        """
        mask = self.compute_mask(batch)
        s_span, s_label = self.model(batch)
        return (s_span, s_label), mask

    def compute_mask(self, batch, offset=1):
        """Build the boolean chart mask selecting the valid upper-triangular cells of each sentence.

        Args:
            batch: A batch dict providing ``token_length``.
            offset: Amount subtracted from every length before the mask is built. Default: 1.

        Returns:
            A boolean tensor of shape ``[batch_size, seq_len, seq_len]`` that is ``True`` at ``(i, j)`` only
            when ``i < j < length`` for the corresponding sentence, where ``length`` is the adjusted length
            and ``seq_len`` is the largest adjusted length in the batch.
        """
        lens = batch['token_length'] - offset
        seq_len = lens.max()
        # [batch_size, 1, seq_len]: column j is valid when j is smaller than the length of that sentence.
        mask = lens.new_tensor(range(seq_len)) < lens.view(-1, 1, 1)
        # Broadcasting against the strict upper triangle gives [batch_size, seq_len, seq_len].
        mask = mask & mask.new_ones(seq_len, seq_len).triu_(1)
        return mask

    def compute_loss(self, out, y, mask, crf_decoder=None):
        """Compute the training loss for one batch.

        Args:
            out: The ``(s_span, s_label)`` pair produced by the model.
            y: Gold charts.
            mask: Boolean chart mask.
            crf_decoder: Decoder providing ``loss``. Default: ``self.model.decoder``.

        Returns:
            ``(loss, span_probs)`` as returned by the decoder, with a negative loss replaced by zero.
        """
        decoder = crf_decoder if crf_decoder else self.model.decoder
        s_span, s_label = out[0], out[1]
        loss, span_probs = decoder.loss(s_span, s_label, y, mask, self.config.mbr)
        if loss < 0:
            # Weird negative loss: zero it in place instead of letting it through.
            loss *= 0
        return loss, span_probs

    def _step(self, optimizer, scheduler, grad_norm):
        """Apply one optimization step.

        Gradients are clipped with ``clip_grad_norm`` first, then the optimizer and the scheduler are
        stepped, and finally the gradients are cleared.

        Args:
            optimizer: The optimizer to step.
            scheduler: The learning rate scheduler to step.
            grad_norm: Passed to ``clip_grad_norm`` together with the model.
        """
        clip_grad_norm(self.model, grad_norm)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    @torch.no_grad()
    def evaluate_dataloader(self, data, criterion, logger=None, ratio_width=None, metric=None, output=None, **kwargs):
        """Evaluate the model on ``data`` without tracking gradients.

        Args:
            data: Dataloader to evaluate on.
            criterion: Unused here; the loss comes from :meth:`compute_loss`.
            logger: Logger used for progress reports.
            ratio_width: Forwarded to the progress timer.
            metric: Metric to reset and reuse. A falsy value makes a new one via :meth:`build_metric`.
            output: If truthy, it is closed at the end; nothing is written to it here.
            **kwargs: Ignored.

        Returns:
            ``(mean_loss, metric)`` where ``mean_loss`` is the loss summed over batches divided by the
            number of batches.
        """
        self.model.eval()
        loss_sum = 0
        if metric:
            metric.reset()
        else:
            metric = self.build_metric()
        timer = CountdownTimer(len(data))
        for seen, batch in enumerate(data, start=1):
            out, mask = self.feed_batch(batch)
            gold_charts = batch['chart_id']
            loss, span_probs = self.compute_loss(out, gold_charts, mask)
            loss_sum += loss.item()
            trees = self.decode_output(out, mask, batch, span_probs)
            self.update_metrics(metric, batch, trees)
            timer.log(f'loss: {loss_sum / seen:.4f} {metric}', ratio_percentage=False, logger=logger,
                      ratio_width=ratio_width)
        loss_sum /= len(data)
        if output:
            output.close()
        return loss_sum, metric

    # noinspection PyMethodOverriding
    def build_model(self, encoder, training=True, **kwargs) -> torch.nn.Module:
        """Assemble the encoder and the CRF decoder into one model.

        Args:
            encoder: An object exposing ``get_output_dim()`` and ``module(vocabs=..., training=...)``.
            training: Forwarded to ``encoder.module``. Default: ``True``.
            **kwargs: Forwarded to :class:`CRFConstituencyDecoder`.

        Returns:
            A :class:`CRFConstituencyModel` wrapping the built encoder module and the decoder.
        """
        # The decoder is created first, sized by the output dimension reported by ``encoder``.
        decoder = CRFConstituencyDecoder(n_labels=len(self.vocabs.chart), n_hidden=encoder.get_output_dim(), **kwargs)
        # ``encoder`` is rebound from the builder object to the module it creates.
        encoder = encoder.module(vocabs=self.vocabs, training=training)
        return CRFConstituencyModel(encoder, decoder)

    def build_dataloader(self,
                         data,
                         batch_size,
                         sampler_builder: SamplerBuilder = None,
                         gradient_accumulation=1,
                         shuffle=False,
                         device=None,
                         logger: logging.Logger = None,
                         **kwargs) -> DataLoader:
        """Wrap ``data`` into a padded dataloader, building the dataset and vocabs when needed.

        Args:
            data: Either a ready ``TransformableDataset`` or raw data handed to :meth:`build_dataset`.
            batch_size: Batch size given to the dataloader.
            sampler_builder: Optional builder of a batch sampler from the sample lengths.
            gradient_accumulation: Forwarded to ``sampler_builder.build``.
            shuffle: Forwarded to the sampler builder and the dataloader.
            device: Device given to the dataloader.
            logger: Logger forwarded to :meth:`build_dataset` and :meth:`build_vocabs`.
            **kwargs: Ignored.

        Returns:
            A ``PadSequenceDataLoader`` over the dataset.
        """
        dataset = data
        if not isinstance(data, TransformableDataset):
            # The encoder's transform comes last; a user-supplied transform from the config runs before it.
            transform = self.config.encoder.transform()
            if self.config.get('transform', None):
                transform = TransformList(self.config.transform, transform)
            dataset = self.build_dataset(data, transform, logger)
        if self.vocabs.mutable:
            # noinspection PyTypeChecker
            self.build_vocabs(dataset, logger)
        # Lengths are collected by iterating the whole dataset, whether or not a sampler is requested.
        lens = [len(sample['token_input_ids']) for sample in dataset]
        sampler = sampler_builder.build(lens, shuffle, gradient_accumulation) if sampler_builder else None
        return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler)

    def predict(self, data: Union[str, List[str]], **kwargs):
        """Parse one tokenized sentence or a list of tokenized sentences.

        Args:
            data: A single sentence (a sequence whose first element is a ``str``) or a list of sentences.
            **kwargs: Forwarded to :meth:`build_dataloader`.

        Returns:
            An empty list for empty input; otherwise one tree for a single sentence, or a list of trees in
            the order of the input sentences.
        """
        if not data:
            return []
        flat = self.input_is_flat(data)
        sentences = [data] if flat else data
        samples = self.build_samples(sentences)
        dataloader = self.build_dataloader(samples, device=self.device, **kwargs)
        outputs, orders = [], []
        for batch in dataloader:
            out, mask = self.feed_batch(batch)
            outputs.extend(self.decode_output(out, mask, batch, span_probs=None))
            orders.extend(batch[IDX])
        # Batches may arrive in a different order than the input; restore it using the recorded indices.
        outputs = reorder(outputs, orders)
        return outputs[0] if flat else outputs

    def input_is_flat(self, data):
        """Tell whether ``data`` is a single sentence rather than a list of sentences.

        Args:
            data: A non-empty sequence; only its first element is inspected.

        Returns:
            ``True`` if the first element is a ``str``.
        """
        return isinstance(data[0], str)

    def build_samples(self, data):
        """Convert tokenized sentences into sample dicts.

        Args:
            data: An iterable of sentences, each a ``list`` of tokens (list concatenation is used).

        Returns:
            A list of ``{'token': [BOS] + tokens + [EOS]}`` dicts, one per sentence.
        """
        return [{'token': [BOS] + token + [EOS]} for token in data]

    # noinspection PyMethodOverriding
    def fit(self,
            trn_data,
            dev_data,
            save_dir,
            encoder,
            lr=5e-5,
            transformer_lr=None,
            adam_epsilon=1e-8,
            weight_decay=0,
            warmup_steps=0.1,
            grad_norm=1.0,
            n_mlp_span=500,
            n_mlp_label=100,
            mlp_dropout=.33,
            batch_size=None,
            batch_max_tokens=5000,
            gradient_accumulation=1,
            epochs=30,
            patience=0.5,
            mbr=True,
            sampler_builder=None,
            delete=('', ':', '``', "''", '.', '?', '!', '-NONE-', 'TOP', ',', 'S1'),
            equal=(('ADVP', 'PRT'),),
            no_subcategory=True,
            eval_trn=True,
            transform=None,
            devices=None,
            logger=None,
            seed=None,
            **kwargs):
        """Train the parser.

        Every argument is collected through ``merge_locals_kwargs(locals(), kwargs)`` and forwarded to the
        parent ``fit``. Because ``locals()`` is forwarded, any local variable introduced in this method
        becomes part of what is passed on.

        Args:
            trn_data: Training data.
            dev_data: Development data.
            save_dir: Directory for saving.
            encoder: Encoder to use; presumably the object later given to :meth:`build_model`.
            lr: Learning rate. Default: 5e-5.
            transformer_lr: Presumably a separate learning rate for the transformer - confirm in the
                optimizer builder. Default: ``None``.
            adam_epsilon: Default: 1e-8.
            weight_decay: Default: 0.
            warmup_steps: Default: 0.1.
            grad_norm: Value used for gradient clipping in :meth:`_step`. Default: 1.0.
            n_mlp_span: Span MLP size of the decoder. Default: 500.
            n_mlp_label: Label MLP size of the decoder. Default: 100.
            mlp_dropout: Dropout ratio of the decoder MLPs. Default: .33.
            batch_size: Default: ``None``.
            batch_max_tokens: Default: 5000.
            gradient_accumulation: Default: 1.
            epochs: Maximum number of epochs. Default: 30.
            patience: Early stopping patience; a ``float`` is a fraction of ``epochs``. Default: 0.5.
            mbr: Whether the CRF returns marginals for MBR decoding. Default: ``True``.
            sampler_builder: Optional builder of batch samplers. Default: ``None``.
            delete: Labels passed to ``factorize`` as its second argument when computing metrics.
            equal: Label pairs passed to ``factorize`` as its third argument; a tuple of pairs is converted
                to a ``dict`` here.
            no_subcategory: Whether ``remove_subcategory`` is applied to the data. Default: ``True``.
            eval_trn: Whether training batches are decoded and scored. Default: ``True``.
            transform: Optional extra transform applied before the encoder's transform. Default: ``None``.
            devices: Default: ``None``.
            logger: Default: ``None``.
            seed: Default: ``None``.
            **kwargs: Extra settings merged into the forwarded arguments.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        # Normalize the default tuple-of-pairs form into a mapping before it is captured by ``locals()``.
        if isinstance(equal, tuple):
            equal = dict(equal)
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def build_dataset(self, data, transform, logger=None):
        """Create a :class:`ConstituencyDataset` with the transform pipeline of this parser.

        Args:
            data: The data to load; caching is enabled only when it is a ``str``.
            transform: The transform appended as the last step of the pipeline.
            logger: Unused.

        Returns:
            The dataset.
        """
        pipeline = [unpack_tree_to_features, self.vocabs, FieldLength('token'), transform]
        if self.config.get('no_subcategory', True):
            # Subcategories are stripped before any other processing.
            pipeline = [remove_subcategory] + pipeline
        return ConstituencyDataset(data, transform=pipeline, cache=isinstance(data, str))

    def build_vocabs(self, trn, logger, **kwargs):
        """Create the chart vocabulary and lock all vocabs after one pass over ``trn``.

        Args:
            trn: The training dataset; it is iterated once, in full.
            logger: Logger receiving the vocabulary summary.
            **kwargs: Ignored.
        """
        self.vocabs.chart = VocabWithNone(pad_token=None, unk_token=None)
        timer = CountdownTimer(len(trn))
        longest = 0
        # Nothing is added to the vocab explicitly in this loop; presumably iterating the dataset runs
        # its transforms (which include ``self.vocabs``) and fills the vocab as a side effect - confirm.
        for sample in trn:
            length = len(sample['token_input_ids'])
            if length > longest:
                longest = length
            timer.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {longest})')
        self.vocabs.chart.set_unk_as_safe_unk()
        self.vocabs.lock()
        self.vocabs.summary(logger)


================================================
FILE: hanlp/components/parsers/constituency/treecrf.py
================================================
# -*- coding:utf-8 -*-
# Adopted from https://github.com/yzhangcs/parser
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import torch
import torch.autograd as autograd
import torch.nn as nn

from hanlp.components.parsers.alg import stripe, istree, eisner, mst, eisner2o


class CRFConstituency(nn.Module):
    r"""
    TreeCRF for calculating partition functions and marginals in :math:`O(n^3)` for constituency trees.

    References:
        - Yu Zhang, Houquan Zhou and Zhenghua Li. 2020.
          `Fast and Accurate Neural CRF Constituency Parsing`_.

    .. _Fast and Accurate Neural CRF Constituency Parsing:
        https://www.ijcai.org/Proceedings/2020/560/
    """

    @torch.enable_grad()
    def forward(self, scores, mask, target=None, mbr=False):
        r"""
        Args:
            scores (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
                Scores of all possible constituents. ``requires_grad_()`` is called on it in place.
            mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
                The mask to avoid parsing over padding tokens.
                For each square matrix in a batch, the positions except upper triangular part should be masked out.
            target (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
                The tensor of gold-standard constituents. ``True`` if a constituent exists. Default: ``None``.
            mbr (bool):
                If ``True``, marginals will be returned to perform minimum Bayes-risk (MBR) decoding. Default: ``False``.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                The first is the training loss averaged by the number of tokens, which won't be returned if ``target=None``.
                The second is a tensor of shape ``[batch_size, seq_len, seq_len]``, in which are marginals if ``mbr=True``,
                or original scores otherwise.
        """

        # Remember whether the caller was tracking gradients before they are force-enabled below.
        training = scores.requires_grad
        # always enable the gradient computation of scores in order for the computation of marginals
        logZ = self.inside(scores.requires_grad_(), mask)
        # marginals are used for decoding, and can be computed by combining the inside pass and autograd mechanism
        probs = scores
        if mbr:
            # The graph is kept only when the caller was training, since the loss is backpropagated later.
            probs, = autograd.grad(logZ, scores, retain_graph=training)
        if target is None:
            return probs
        # Negative log-likelihood: log-partition minus the score of the gold constituents,
        # divided by the number of unmasked cells in the first chart row over the whole batch.
        loss = (logZ - scores[mask & target].sum()) / mask[:, 0].sum()

        return loss, probs

    def inside(self, scores, mask):
        r"""
        Run the inside algorithm over span widths and return the log-partition.

        Args:
            scores (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
                Scores of all possible constituents.
            mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
                The chart mask; only its first row is used, to locate the root span of each sentence.

        Returns:
            ~torch.Tensor:
                A scalar: the inside scores of the root spans, summed over the batch.
        """
        # Count of unmasked cells in chart row 0 per sentence; used below as the end index of the root span.
        lens = mask[:, 0].sum(-1)
        batch_size, seq_len, _ = scores.shape
        # [seq_len, seq_len, batch_size]
        scores, mask = scores.permute(1, 2, 0), mask.permute(1, 2, 0)
        # Chart of inside scores; cells that are never filled stay at -inf.
        s = torch.full_like(scores, float('-inf'))

        for w in range(1, seq_len):
            # n denotes the number of spans to iterate,
            # from span (0, w) to span (n, n+w) given width w
            n = seq_len - w

            if w == 1:
                # Spans of width 1 have no split point: their inside score is their own score.
                s.diagonal(w).copy_(scores.diagonal(w))
                continue
            # Sum of two chart stripes; presumably the left and right sub-span of every split point - see ``stripe``.
            # [n, w, batch_size]
            s_s = stripe(s, n, w - 1, (0, 1)) + stripe(s, n, w - 1, (1, w), 0)
            # [batch_size, n, w]
            s_s = s_s.permute(2, 0, 1)
            if s_s.requires_grad:
                # Replace NaN gradients by zero in place during the backward pass.
                s_s.register_hook(lambda x: x.masked_fill_(torch.isnan(x), 0))
            # Marginalize over the last dimension, then add the score of the span itself.
            s_s = s_s.logsumexp(-1)
            s.diagonal(w).copy_(s_s + scores.diagonal(w))

        # Pick cell (0, lens[b]) for every sentence b and sum over the batch.
        return s[0].gather(0, lens.unsqueeze(0)).sum()


class CRF2oDependency(nn.Module):
    r"""
    Second-order TreeCRF for calculating partition functions and marginals in :math:`O(n^3)` for projective dependency trees.

    References:
        - Yu Zhang, Zhenghua Li and Min Zhang. 2020.
          `Efficient Second-Order TreeCRF for Neural Dependency Parsing`_.

    .. _Efficient Second-Order TreeCRF for Neural Dependency Parsing:
        https://www.aclweb.org/anthology/2020.acl-main.302/
    """

    def __init__(self):
        super().__init__()
        # cross entropy over the relation labels, used by :meth:`loss`
        self.criterion = nn.CrossEntropyLoss()

    @staticmethod
    def _mask_nan_grad(tensor):
        r"""Registers a backward hook that replaces NaN gradient entries of ``tensor`` with zeros.

        Nothing is registered when ``tensor`` does not require gradients, since registering a hook
        on such a tensor raises a ``RuntimeError``.
        """
        if tensor.requires_grad:
            tensor.register_hook(lambda x: x.masked_fill_(torch.isnan(x), 0))

    @torch.enable_grad()
    def forward(self, scores, mask, target=None, mbr=True, partial=False):
        r"""
        Args:
            scores (~torch.Tensor, ~torch.Tensor):
                Tuple of two tensors `s_arc` and `s_sib`.
                `s_arc` (``[batch_size, seq_len, seq_len]``) holds scores of all possible dependent-head pairs.
                `s_sib` (``[batch_size, seq_len, seq_len, seq_len]``) holds the scores of dependent-head-sibling triples.
            mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
                The mask to avoid aggregation on padding tokens.
                The first column serving as pseudo words for roots should be ``False``.
            target (~torch.LongTensor, ~torch.LongTensor):
                Tuple ``(arcs, sibs)`` of gold-standard dependent-head pairs and dependent-head-sibling triples.
                `arcs` is indexed with ``mask``, i.e. it is ``[batch_size, seq_len]``.
                If partially annotated, the unannotated positions should be filled with -1.
                Default: ``None``.
            mbr (bool):
                If ``True``, marginals will be returned to perform minimum Bayes-risk (MBR) decoding. Default: ``True``.
            partial (bool):
                ``True`` indicates that the trees are partially annotated. Default: ``False``.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                The first is the training loss averaged by the number of tokens, which won't be returned if ``target=None``.
                The second is a tensor of shape ``[batch_size, seq_len, seq_len]``, in which are marginals if ``mbr=True``,
                or original scores otherwise.
        """

        s_arc, s_sib = scores
        # remember whether the caller needs the graph after the marginals have been computed
        training = s_arc.requires_grad
        # always enable the gradient computation of scores in order for the computation of marginals
        logZ = self.inside((s_arc.requires_grad_(), s_sib.requires_grad_()), mask)
        # marginals are used for decoding, and can be computed by combining the inside pass and autograd mechanism
        probs = s_arc
        if mbr:
            probs, = autograd.grad(logZ, s_arc, retain_graph=training)

        if target is None:
            return probs
        arcs, sibs = target
        # the second inside process is needed if use partial annotation
        if partial:
            score = self.inside(scores, mask, arcs)
        else:
            # gather the scores of the gold arcs and of the gold siblings (sibling index > 0)
            arc_seq, sib_seq = arcs[mask], sibs[mask]
            arc_mask, sib_mask = mask, sib_seq.gt(0)
            sib_seq = sib_seq[sib_mask]
            s_sib = s_sib[mask][torch.arange(len(arc_seq)), arc_seq]
            s_arc = s_arc[arc_mask].gather(-1, arc_seq.unsqueeze(-1))
            s_sib = s_sib[sib_mask].gather(-1, sib_seq.unsqueeze(-1))
            score = s_arc.sum() + s_sib.sum()
        loss = (logZ - score) / mask.sum()

        return loss, probs

    def inside(self, scores, mask, cands=None):
        r"""
        Runs the second-order inside pass.

        Args:
            scores (~torch.Tensor, ~torch.Tensor):
                Tuple ``(s_arc, s_sib)`` as accepted by :meth:`forward`.
            mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
                Summed along the sequence axis to obtain the end position of each sentence.
            cands (~torch.LongTensor): ``[batch_size, seq_len]``.
                Optional head constraints; negative entries leave a position unconstrained. Default: ``None``.

        Returns:
            ~torch.Tensor:
                A scalar: the complete-span score ``C(0->length)`` of every sentence, summed over the batch.
        """
        # the end position of each sentence in a batch
        lens = mask.sum(1)
        s_arc, s_sib = scores
        batch_size, seq_len, _ = s_arc.shape
        # [seq_len, seq_len, batch_size]
        s_arc = s_arc.permute(2, 1, 0)
        # [seq_len, seq_len, seq_len, batch_size]
        s_sib = s_sib.permute(2, 1, 3, 0)
        # charts of incomplete (s_i), sibling (s_s) and complete (s_c) spans
        s_i = torch.full_like(s_arc, float('-inf'))
        s_s = torch.full_like(s_arc, float('-inf'))
        s_c = torch.full_like(s_arc, float('-inf'))
        s_c.diagonal().fill_(0)

        # set the scores of arcs excluded by cands to -inf
        if cands is not None:
            mask = mask.index_fill(1, lens.new_tensor(0), 1)
            mask = (mask.unsqueeze(1) & mask.unsqueeze(-1)).permute(2, 1, 0)
            cands = cands.unsqueeze(-1).index_fill(1, lens.new_tensor(0), -1)
            cands = cands.eq(lens.new_tensor(range(seq_len))) | cands.lt(0)
            cands = cands.permute(2, 1, 0) & mask
            s_arc = s_arc.masked_fill(~cands, float('-inf'))

        for w in range(1, seq_len):
            # n denotes the number of spans to iterate,
            # from span (0, w) to span (n, n+w) given width w
            n = seq_len - w
            # I(j->i) = logsum(exp(I(j->r) + S(j->r, i)) +, i < r < j
            #                  exp(C(j->j) + C(i->j-1)))
            #           + s(j->i)
            # [n, w, batch_size]
            il = stripe(s_i, n, w, (w, 1)) + stripe(s_s, n, w, (1, 0), 0)
            il += stripe(s_sib[range(w, n + w), range(n)], n, w, (0, 1))
            # [n, 1, batch_size]
            il0 = stripe(s_c, n, 1, (w, w)) + stripe(s_c, n, 1, (0, w - 1))
            # il0[0] are set to zeros since the scores of the complete spans starting from 0 are always -inf
            il[:, -1] = il0.index_fill_(0, lens.new_tensor(0), 0).squeeze(1)
            self._mask_nan_grad(il)
            il = il.permute(2, 0, 1).logsumexp(-1)
            s_i.diagonal(-w).copy_(il + s_arc.diagonal(-w))
            # I(i->j) = logsum(exp(I(i->r) + S(i->r, j)) +, i < r < j
            #                  exp(C(i->i) + C(j->i+1)))
            #           + s(i->j)
            # [n, w, batch_size]
            ir = stripe(s_i, n, w) + stripe(s_s, n, w, (0, w), 0)
            ir += stripe(s_sib[range(n), range(w, n + w)], n, w)
            ir[0] = float('-inf')
            # [n, 1, batch_size]
            ir0 = stripe(s_c, n, 1) + stripe(s_c, n, 1, (w, 1))
            ir[:, 0] = ir0.squeeze(1)
            self._mask_nan_grad(ir)
            ir = ir.permute(2, 0, 1).logsumexp(-1)
            s_i.diagonal(w).copy_(ir + s_arc.diagonal(w))

            # [n, w, batch_size]
            slr = stripe(s_c, n, w) + stripe(s_c, n, w, (w, 1))
            self._mask_nan_grad(slr)
            slr = slr.permute(2, 0, 1).logsumexp(-1)
            # S(j, i) = logsumexp(C(i->r) + C(j->r+1)), i <= r < j
            s_s.diagonal(-w).copy_(slr)
            # S(i, j) = logsumexp(C(i->r) + C(j->r+1)), i <= r < j
            s_s.diagonal(w).copy_(slr)

            # C(j->i) = logsumexp(C(r->i) + I(j->r)), i <= r < j
            cl = stripe(s_c, n, w, (0, 0), 0) + stripe(s_i, n, w, (w, 0))
            self._mask_nan_grad(cl)
            s_c.diagonal(-w).copy_(cl.permute(2, 0, 1).logsumexp(-1))
            # C(i->j) = logsumexp(I(i->r) + C(r->j)), i < r <= j
            cr = stripe(s_i, n, w, (0, 1)) + stripe(s_c, n, w, (1, w), 0)
            self._mask_nan_grad(cr)
            s_c.diagonal(w).copy_(cr.permute(2, 0, 1).logsumexp(-1))
            # disable multi words to modify the root
            s_c[0, w][lens.ne(w)] = float('-inf')

        # pick C(0->len_b) for each sentence b and sum over the batch
        return s_c[0].gather(0, lens.unsqueeze(0)).sum()

    def loss(self, s_arc, s_sib, s_rel, arcs, sibs, rels, mask, mbr=True, partial=False):
        r"""
        Args:
            s_arc (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
                Scores of all possible arcs.
            s_sib (~torch.Tensor): ``[batch_size, seq_len, seq_len, seq_len]``.
                Scores of all possible dependent-head-sibling triples.
            s_rel (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
                Scores of all possible labels on each arc.
            arcs (~torch.LongTensor): ``[batch_size, seq_len]``.
                The tensor of gold-standard arcs.
            sibs (~torch.LongTensor):
                The tensor of gold-standard siblings, forwarded unchanged to :meth:`forward`.
            rels (~torch.LongTensor): ``[batch_size, seq_len]``.
                The tensor of gold-standard labels.
            mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
                The mask for covering the unpadded tokens.
            mbr (bool):
                If ``True``, returns marginals for MBR decoding. Default: ``True``.
            partial (bool):
                ``True`` denotes the trees are partially annotated. Default: ``False``.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                The training loss and
                original arc scores of shape ``[batch_size, seq_len, seq_len]`` if ``mbr=False``, or marginals otherwise.
        """

        scores, target = (s_arc, s_sib), (arcs, sibs)
        arc_loss, arc_probs = self.forward(scores, mask, target, mbr, partial)
        # -1 denotes un-annotated arcs
        if partial:
            mask = mask & arcs.ge(0)
        # keep only the label scores of the gold head of every (annotated) token
        s_rel, rels = s_rel[mask], rels[mask]
        s_rel = s_rel[torch.arange(len(rels)), arcs[mask]]
        rel_loss = self.criterion(s_rel, rels)
        loss = arc_loss + rel_loss
        return loss, arc_probs

    def decode(self, s_arc, s_sib, s_rel, mask, tree=False, mbr=True, proj=False):
        r"""
        Args:
            s_arc (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
                Scores of all possible arcs.
            s_sib (~torch.Tensor): ``[batch_size, seq_len, seq_len, seq_len]``.
                Scores of all possible dependent-head-sibling triples.
            s_rel (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
                Scores of all possible labels on each arc.
            mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
                The mask for covering the unpadded tokens.
            tree (bool):
                If ``True``, ensures to output well-formed trees. Default: ``False``.
            mbr (bool):
                If ``True``, performs MBR decoding. Default: ``True``.
            proj (bool):
                If ``True``, ensures to output projective trees. Default: ``False``.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                Predicted arcs and labels of shape ``[batch_size, seq_len]``.
        """

        lens = mask.sum(1)
        # greedy head selection; repaired below only when a well-formed tree is requested
        arc_preds = s_arc.argmax(-1)
        if tree:
            bad = [not istree(seq[1:i + 1], proj)
                   for i, seq in zip(lens.tolist(), arc_preds.tolist())]
            if any(bad):
                if proj and not mbr:
                    # second-order decoding is re-run on the whole batch
                    arc_preds = eisner2o((s_arc, s_sib), mask)
                else:
                    alg = eisner if proj else mst
                    arc_preds[bad] = alg(s_arc[bad], mask[bad])
        # label of each token = best label on the arc to its predicted head
        rel_preds = s_rel.argmax(-1).gather(-1, arc_preds.unsqueeze(-1)).squeeze(-1)

        return arc_preds, rel_preds


================================================
FILE: hanlp/components/parsers/parse_alg.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-04-02 23:20
from collections import defaultdict

import hanlp.utils.span_util
from hanlp.components.parsers.chu_liu_edmonds import decode_mst
import numpy as np


class Tarjan:
    """Finds the strongly connected components (cycles) of a predicted dependency graph with Tarjan's algorithm"""

    def __init__(self, prediction, tokens):
        """Builds the head -> dependents graph and immediately collects its strongly connected components.

        Parameters
        ----------
        prediction : numpy.ndarray
            a predicted dependency tree where prediction[dep_idx] = head_idx
        tokens : numpy.ndarray
            the tokens we care about (i.e. exclude _GO, _EOS, and _PAD)
        """
        self._edges = defaultdict(set)
        self._vertices = {0}
        # the k-th kept token becomes vertex k + 1; vertex 0 is always present
        for position, head in enumerate(prediction[tokens]):
            dependent = position + 1
            self._vertices.add(dependent)
            self._edges[head].add(dependent)
        self._indices = {}
        self._lowlinks = {}
        self._onstack = defaultdict(bool)
        self._SCCs = []

        pending = []
        # every search is started with index 0
        for vertex in self.vertices:
            if vertex not in self.indices:
                self.strongconnect(vertex, 0, pending)

    # =============================================================
    def strongconnect(self, v, index, stack):
        """Visits ``v`` depth-first and records every component that is closed at ``v``.

        Args:
          v: the vertex to visit
          index: the index assigned to ``v``; children are visited with ``index + 1``
          stack: the list of vertices visited but not yet assigned to a component

        Returns:
          None; completed components are appended to ``SCCs``

        """
        self._indices[v] = self._lowlinks[v] = index
        stack.append(v)
        self._onstack[v] = True
        child_index = index + 1
        for w in self.edges[v]:
            if w not in self.indices:
                self.strongconnect(w, child_index, stack)
                self._lowlinks[v] = min(self._lowlinks[v], self._lowlinks[w])
            elif self._onstack[w]:
                self._lowlinks[v] = min(self._lowlinks[v], self._indices[w])

        if self._lowlinks[v] != self._indices[v]:
            return
        # v is the entry point of a component: unwind the stack down to and including v
        component = set()
        self._SCCs.append(component)
        while True:
            member = stack.pop()
            self._onstack[member] = False
            component.add(member)
            if member == v:
                break

    # ======================
    @property
    def edges(self):
        """Mapping from a head to the set of its dependents"""
        return self._edges

    @property
    def vertices(self):
        """Set of all vertices, including vertex 0"""
        return self._vertices

    @property
    def indices(self):
        """Mapping from a visited vertex to the index it was assigned"""
        return self._indices

    @property
    def SCCs(self):
        """List of strongly connected components, each a set of vertices"""
        return self._SCCs


class UnionFind(object):
    """Disjoint-set forest over ``0..n-1`` with path compression and union by height"""

    def __init__(self, n) -> None:
        """Creates ``n`` singleton sets."""
        super().__init__()
        self.parent = list(range(n))
        self.height = [0 for _ in range(n)]

    def find(self, x):
        """Returns the representative of the set containing ``x``, compressing the path on the way."""
        root = self.parent[x]
        if root != x:
            root = self.parent[x] = self.find(root)
        return root

    def unite(self, x, y):
        """Merges the sets containing ``x`` and ``y``; the shallower tree is hung under the deeper one."""
        root_x, root_y = self.find(x), self.find(y)
        if root_x == root_y:
            return
        if self.height[root_x] < self.height[root_y]:
            self.parent[root_x] = root_y
            return
        self.parent[root_y] = root_x
        if self.height[root_x] == self.height[root_y]:
            self.height[root_x] += 1

    def same(self, x, y):
        """Tells whether ``x`` and ``y`` belong to the same set."""
        return self.find(x) == self.find(y)


def tarjan(parse_probs, length, tokens_to_keep, ensure_tree=True):
    """Picks a head for every token, optionally repairing the prediction into a single-rooted, cycle-free tree.

    Adapted from Timothy Dozat https://github.com/tdozat/Parser/blob/master/lib/models/nn.py

    Args:
      parse_probs(NDArray): seq_len x seq_len, the probability of arcs
      length(NDArray): sentence length including ROOT
      tokens_to_keep(NDArray): mask matrix
      ensure_tree: whether to enforce a unique root and break cycles (Default value = True)

    Returns:
      NDArray: the predicted head index of every position

    """
    if not ensure_tree:
        # block and pad heads
        masked_probs = parse_probs * tokens_to_keep
        return np.argmax(masked_probs, axis=1)

    parse_preds, parse_probs, tokens = unique_root(parse_probs, tokens_to_keep, length)
    # remove cycles
    graph = Tarjan(parse_preds, tokens)
    for component in graph.SCCs:
        if len(component) <= 1:
            continue
        # gather the cycle together with everything hanging below it
        dependents = set()
        frontier = set(component)
        while frontier:
            node = frontier.pop()
            if node in dependents:
                continue
            dependents.add(node)
            frontier.update(graph.edges[node])
        # The indices of the nodes that participate in the cycle
        cycle = np.array(list(component))
        # The probabilities of the current heads
        old_heads = parse_preds[cycle]
        old_head_probs = parse_probs[cycle, old_heads]
        # Set the probability of depending on a non-head to zero
        non_heads = np.array(list(dependents))
        rows = np.repeat(cycle, len(non_heads))
        cols = np.repeat([non_heads], len(cycle), axis=0).flatten()
        parse_probs[rows, cols] = 0
        # Get new potential heads and their probabilities
        new_heads = np.argmax(parse_probs[cycle][:, tokens], axis=1) + 1
        gains = parse_probs[cycle, new_heads] / old_head_probs
        # Select the most probable change
        choice = np.argmax(gains)
        moved, old_head, new_head = cycle[choice], old_heads[choice], new_heads[choice]
        # Make the change
        parse_preds[moved] = new_head
        graph.edges[new_head].add(moved)
        graph.edges[old_head].remove(moved)
    return parse_preds


def chu_liu_edmonds(parse_probs, length):
    """Decodes a maximum spanning tree from arc scores with the Chu-Liu/Edmonds algorithm.

    Args:
      parse_probs(NDArray): seq_len x seq_len arc scores
      length: sentence length including ROOT

    Returns:
      the head index of every position, with the head of position 0 forced to 0

    """
    # decode on the transpose of the supplied scores
    # (assumes decode_mst indexes its energy as [head, dependent] -- TODO confirm)
    tree = decode_mst(parse_probs.T, length, False)[0]
    tree[0] = 0
    return tree


def unique_root(parse_probs, tokens_to_keep: np.ndarray, length):
    """Predicts heads greedily and then makes sure that exactly one token is attached to position 0.

    Args:
      parse_probs: square matrix of arc probabilities, rows being dependents
      tokens_to_keep: mask; a 1-D mask is applied per row
      length: sentence length; positions ``1..length-1`` are treated as tokens

    Returns:
      tuple: the predicted heads, the masked copy of the probabilities (possibly edited) and the token positions

    """
    identity = np.eye(len(tokens_to_keep))
    if tokens_to_keep.ndim == 1:
        tokens_to_keep = np.expand_dims(tokens_to_keep, -1)
    # block loops and pad heads
    parse_probs = parse_probs * tokens_to_keep * (1 - identity)
    parse_preds = np.argmax(parse_probs, axis=1)
    tokens = np.arange(1, length)
    roots = np.where(parse_preds[tokens] == 0)[0] + 1
    n_roots = len(roots)
    if n_roots == 0:
        # ensure at least one root: promote the token that loses the least by re-attaching to 0
        to_root = parse_probs[tokens, 0]
        to_current_head = parse_probs[tokens, parse_preds[tokens]]
        promoted = tokens[np.argmax(to_root / to_current_head)]
        parse_preds[promoted] = 0
    elif n_roots > 1:
        # ensure at most one root
        to_root = parse_probs[roots, 0]
        # forbid the root as a head while searching for alternatives
        parse_probs[roots, 0] = 0
        alternatives = np.argmax(parse_probs[roots][:, tokens], axis=1) + 1
        ratios = parse_probs[roots, alternatives] / to_root
        # the candidate with the weakest alternative stays attached to the root
        kept = roots[np.argmin(ratios)]
        parse_preds[roots] = alternatives
        parse_preds[kept] = 0
    return parse_preds, parse_probs, tokens


def dfs(graph, start, end):
    """Lazily yields paths from ``start`` to ``end`` found by an iterative depth-first search.

    A path lists the nodes visited after ``start`` and ends with ``end``. A node that is already
    on the current path is not visited again; ``start`` itself is not recorded on the path.

    Args:
      graph: indexable container mapping a node to its successors
      start: the node to start from
      end: the node to reach

    """
    stack = [(start, [])]
    while stack:
        node, trail = stack.pop()
        if trail and node == end:
            yield trail
        else:
            stack.extend((successor, trail + [successor])
                         for successor in graph[node] if successor not in trail)


def mst_then_greedy(arc_scores, rel_scores, mask, root_rel_idx, rel_idx=None):
    """Decodes a spanning tree first and then adds further arcs on top of it.

    The score arrays are sliced to the sentence length and edited in place while decoding.

    Args:
      arc_scores: arc scores, an arc counts as predicted when its score is positive
      rel_scores: relation scores of every arc
      mask: token mask; its sum plus one is taken as the sentence length
      root_rel_idx: index of the root relation
      rel_idx: optional index of a relation that is forbidden on tree arcs (Default value = None)

    Returns:
      tuple: the decoded tree and the graph produced by ``add_secondary_arcs_by_scores``

    """
    from scipy.special import softmax
    from scipy.special import expit as sigmoid
    length = sum(mask) + 1
    mask = mask[:length]
    arc_scores = arc_scores[:length, :length]
    rel_scores = rel_scores[:length, :length, :]
    predicted = arc_scores > 0
    arc_probs = sigmoid(arc_scores)
    rel_probs = softmax(rel_scores, -1)
    if not any(predicted[1:, 0]):  # no root
        # fall back to the token whose root relation is the most probable
        fallback = np.argmax(rel_probs[1:, 0, root_rel_idx]) + 1
        arc_probs[fallback, 0] = 1
    parse_preds, _, _ = unique_root(arc_probs, mask, length)
    # called for its in-place edits of the score arrays
    adjust_root_score(arc_scores, parse_preds, root_rel_idx, rel_scores)
    tree = chu_liu_edmonds(arc_scores, length)
    if rel_idx is not None:  # Unknown DEPREL label: 'ref'
        rel_scores[np.arange(len(tree)), tree, rel_idx] = -float('inf')
    graph = add_secondary_arcs_by_scores(arc_scores, rel_scores, tree, root_rel_idx)
    return tree, graph


def adjust_root_score(arc_scores, parse_preds, root_rel_idx, rel_scores=None):
    """Rewrites the scores in place so that only the predicted root can attach to position 0.

    Args:
      arc_scores: arc scores, edited in place
      parse_preds: predicted heads; tokens whose head is 0 are the roots
      root_rel_idx: index of the root relation
      rel_scores: optional relation scores, edited in place (Default value = None)

    Returns:
      the positions of the root tokens

    """
    root = np.where(parse_preds[1:] == 0)[0] + 1
    # push every attachment to position 0 to the bottom ...
    lowest = min(np.min(arc_scores), -1000)
    arc_scores[:, 0] = lowest
    # ... and lift the root's attachment to the top (the maximum is taken after the edit above)
    highest = max(np.max(arc_scores), 1000)
    arc_scores[root, 0] = highest
    if rel_scores is None:
        return root
    rel_scores[:, :, root_rel_idx] = -float('inf')
    rel_scores[root, 0, root_rel_idx] = float('inf')
    return root


def add_secondary_arcs_by_scores(arc_scores, rel_scores, tree, root_rel_idx, arc_preds=None):
    """Derives arc and relation predictions from the scores and delegates to ``add_secondary_arcs_by_preds``.

    Args:
      arc_scores: arc scores
      rel_scores: relation scores; the best relation is taken along the last axis
      tree: the heads of the base tree, converted to an array when needed
      root_rel_idx: index of the root relation
      arc_preds: optional arc predictions; positive scores are used when omitted (Default value = None)

    Returns:
      the graph built by ``add_secondary_arcs_by_preds``

    """
    tree = tree if isinstance(tree, np.ndarray) else np.array(tree)
    if arc_preds is None:
        arc_preds = arc_scores > 0
    best_rels = np.argmax(rel_scores, axis=-1)
    return add_secondary_arcs_by_preds(arc_scores, arc_preds, best_rels, tree, root_rel_idx)


def add_secondary_arcs_by_preds(arc_scores, arc_preds, rel_preds, tree, root_rel_idx=None):
    """Extends a tree with the predicted arcs that do not close a cycle.

    Candidate arcs are tried from the highest score to the lowest. Arcs touching position 0 and
    arcs already present are skipped.

    Args:
      arc_scores: arc scores indexed as ``[dependent, head]``
      arc_preds: boolean matrix of predicted arcs, same indexing
      rel_preds: predicted relation of every arc, same indexing
      tree: the heads of the base tree
      root_rel_idx: relation forced on the arc to position 0 (Default value = None)

    Returns:
      list: for every position, the list of its ``(head, relation)`` pairs sorted by head

    """
    size = len(tree)
    dh = np.argwhere(arc_preds)
    sdh = sorted([(arc_scores[x[0], x[1]], list(x)) for x in dh], reverse=True)
    # graph[h] lists the dependents of h, starting from the tree arcs
    graph = [[] for _ in range(size)]
    for d, h in enumerate(tree):
        if d:
            graph[h].append(d)
    for _, (d, h) in sdh:
        if not d or not h or d in graph[h]:
            continue
        # h -> d is only added when h cannot already be reached from d
        if next(dfs(graph, d, h), None) is None:
            graph[h].append(d)
    parse_graph = [[] for _ in range(size)]
    num_root = 0
    for h in range(size):
        for d in graph[h]:
            rel = rel_preds[d, h]
            if h == 0 and root_rel_idx is not None:
                rel = root_rel_idx
                assert num_root == 0
                num_root += 1
            parse_graph[d].append((h, rel))
    # sort the heads of every dependent once the graph is complete,
    # instead of sorting whichever dependent the inner loop visited last
    return [sorted(heads) for heads in parse_graph]


def adjust_root_score_then_add_secondary_arcs(arc_scores, rel_scores, tree, root_rel_idx):
    """Forbids further attachments to position 0 and the root relation, then adds secondary arcs to ``tree``.

    Args:
      arc_scores: arc scores; cropped to the size of ``tree`` when larger
      rel_scores: relation scores; the root relation is set to ``-inf`` in place
      tree: the heads of the base tree
      root_rel_idx: index of the root relation

    Returns:
      the graph built by ``add_secondary_arcs_by_scores``

    """
    # compare sizes, not a size with the tree itself (ambiguous truth value for an array tree)
    if len(arc_scores) != len(tree):
        arc_scores = arc_scores[:len(tree), :len(tree)]
        rel_scores = rel_scores[:len(tree), :len(tree), :]
    parse_preds = arc_scores > 0
    # adjust_root_score(arc_scores, parse_preds, rel_scores)
    parse_preds[:, 0] = False  # set heads to False
    rel_scores[:, :, root_rel_idx] = -float('inf')
    return add_secondary_arcs_by_scores(arc_scores, rel_scores, tree, root_rel_idx, parse_preds)


================================================
FILE: hanlp/components/parsers/ud/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-14 20:34


================================================
FILE: hanlp/components/parsers/ud/lemma_edit.py
================================================
"""
Utilities for processing lemmas

Adopted from UDPipe Future
https://github.com/CoNLL-UD-2018/UDPipe-Future
"""


def min_edit_script(source, target, allow_copy=False):
    """Finds the minimum edit script to transform the source to the target

    The script uses ``-`` to delete a source character, ``+c`` to insert the character ``c``
    and, when copying is allowed, ``→`` to keep a matching character at no cost.

    Args:
      source: the string to transform
      target: the string to obtain
      allow_copy: whether matching characters may be copied (Default value = False)

    Returns:
      str: the edit script

    """
    n_source, n_target = len(source), len(target)
    # every cell starts with a cost no real script can reach
    unset = (n_source + n_target + 1, None)
    table = [[unset] * (n_target + 1) for _ in range(n_source + 1)]
    table[0][0] = (0, "")
    for i in range(n_source + 1):
        for j in range(n_target + 1):
            if i == 0 and j == 0:
                continue
            best = table[i][j]
            if allow_copy and i and j and source[i - 1] == target[j - 1]:
                cost, script = table[i - 1][j - 1]
                if cost < best[0]:
                    best = (cost, script + "→")
            if i:
                cost, script = table[i - 1][j]
                if cost < best[0]:
                    best = (cost + 1, script + "-")
            if j:
                cost, script = table[i][j - 1]
                if cost < best[0]:
                    best = (cost + 1, script + "+" + target[j - 1])
            table[i][j] = best
    return table[-1][-1][1]


def gen_lemma_rule(form, lemma, allow_copy=False):
    """Generates a lemma rule to transform the source to the target

    The rule is ``<casing>;<edit>``. The casing part records where the lemma switches between
    upper and lower case. The edit part is either ``a<lemma>`` (absolute lemma, used when form and
    lemma share no character) or ``d<prefix script>¦<suffix script>`` around their longest common substring.

    Args:
      form: the word form
      lemma: the lemma of the form
      allow_copy: forwarded to ``min_edit_script`` (Default value = False)

    Returns:
      str: the lemma rule

    """
    form = form.lower()

    # positions in the second half of the lemma are counted from its end
    middle = len(lemma) // 2
    casing_parts = []
    last_case = None
    for position, char in enumerate(lemma):
        case = "↓" if char.lower() == char else "↑"
        if case != last_case:
            anchor = position if position <= middle else position - len(lemma)
            casing_parts.append("{}{}".format(case, anchor))
        last_case = case
    lemma = lemma.lower()

    # longest common substring; the first one found wins a tie
    best, best_form, best_lemma = 0, 0, 0
    for lemma_start in range(len(lemma)):
        for form_start in range(len(form)):
            run = 0
            while (form_start + run < len(form) and lemma_start + run < len(lemma)
                   and form[form_start + run] == lemma[lemma_start + run]):
                run += 1
            if run > best:
                best, best_form, best_lemma = run, form_start, lemma_start

    prefix = "¦".join(casing_parts) + ";"
    if not best:
        return prefix + "a" + lemma
    head_script = min_edit_script(form[:best_form], lemma[:best_lemma], allow_copy)
    tail_script = min_edit_script(form[best_form + best:], lemma[best_lemma + best:], allow_copy)
    return prefix + "d{}¦{}".format(head_script, tail_script)


def apply_lemma_rule(form, lemma_rule):
    """Applies the lemma rule to the form to generate the lemma

    Args:
      form: the word form
      lemma_rule: a rule ``<casing>;<edit>`` as produced by ``gen_lemma_rule``

    Returns:
      str: the lemma. The lowercased form is returned when the rule has no ``;`` separator or
      when its edit script cannot be applied to the form.

    """
    cells = lemma_rule.split(";", 1)
    if len(cells) == 1:  # Some predicted lemma rules are _, which might be due to partial annotation
        return form.lower()
    casing, rule = cells
    if rule.startswith("a"):
        # absolute rule: the lemma is spelled out
        lemma = rule[1:]
    else:
        form = form.lower()
        rules, rule_sources = rule[1:].split("¦"), []
        assert len(rules) == 2
        # count how many characters of the form each script consumes
        for script in rules:
            source, i = 0, 0
            while i < len(script):
                if script[i] == "→" or script[i] == "-":
                    source += 1
                else:
                    assert script[i] == "+"
                    i += 1  # skip the inserted character
                i += 1
            rule_sources.append(source)

        try:
            lemma = ""
            for i in range(2):
                # the first script runs from the start of the form, the second one is aligned to its end
                j, offset = 0, (0 if i == 0 else len(form) - rule_sources[1])
                while j < len(rules[i]):
                    if rules[i][j] == "→":
                        lemma += form[offset]
                        offset += 1
                    elif rules[i][j] == "-":
                        offset += 1
                    else:
                        assert (rules[i][j] == "+")
                        lemma += rules[i][j + 1]
                        j += 1
                    j += 1
                if i == 0:
                    # the untouched middle of the form is copied as is
                    lemma += form[rule_sources[0]: len(form) - rule_sources[1]]
        except Exception:
            # best effort: a rule that does not fit the form leaves the form unchanged
            lemma = form

    for case_rule in casing.split("¦"):
        # an empty entry (rule generated for an empty lemma) carries no casing information
        if not case_rule or case_rule == "↓0":
            continue  # The lemma is lowercased initially
        case, offset = case_rule[0], int(case_rule[1:])
        lemma = lemma[:offset] + (lemma[offset:].upper() if case == "↑" else lemma[offset:].lower())

    return lemma


================================================
FILE: hanlp/components/parsers/ud/tag_decoder.py
================================================
# This file is modified from udify, which is licensed under the MIT license:
# MIT License
#
# Copyright (c) 2019 Dan Kondratyuk
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Decodes sequences of tags, e.g., POS tags, given a list of contextualized word embeddings
"""

from typing import Dict

import numpy
import torch
import torch.nn.functional as F
from torch.nn.modules.adaptive import AdaptiveLogSoftmaxWithLoss
from torch.nn.modules.linear import Linear

from hanlp.components.parsers.ud.lemma_edit import apply_lemma_rule
from hanlp.components.parsers.ud.udify_util import sequence_cross_entropy, sequence_cross_entropy_with_logits


class TagDecoder(torch.nn.Module):
    """A basic sequence tagger that decodes from inputs of word embeddings.

    Args:
        input_dim: Size of the last dimension of the tensors passed to :meth:`forward`.
        num_classes: Number of tags to score.
        label_smoothing: Label smoothing value handed to the sequence cross entropy helpers.
        adaptive: Score tags with ``AdaptiveLogSoftmaxWithLoss`` when ``True``, otherwise with a
            single ``Linear`` projection.
    """

    def __init__(self,
                 input_dim,
                 num_classes,
                 label_smoothing: float = 0.03,
                 adaptive: bool = False) -> None:
        super(TagDecoder, self).__init__()

        self.label_smoothing = label_smoothing
        self.num_classes = num_classes
        self.adaptive = adaptive

        if self.adaptive:
            # Two cutoffs at 1/15 and 3/15 of the tag inventory split it into a head and two tail clusters.
            adaptive_cutoffs = [round(self.num_classes / 15), 3 * round(self.num_classes / 15)]
            self.task_output = AdaptiveLogSoftmaxWithLoss(input_dim,
                                                          self.num_classes,
                                                          cutoffs=adaptive_cutoffs,
                                                          div_value=4.0)
        else:
            # The projection consumes the encoder features, so its input size is ``input_dim``
            # (this class never defines an ``output_dim`` attribute).
            self.task_output = Linear(input_dim, self.num_classes)

    def forward(self,
                encoded_text: torch.FloatTensor,
                mask: torch.LongTensor,
                gold_tags: torch.LongTensor,
                ) -> Dict[str, torch.Tensor]:
        """Score every position of ``encoded_text`` and optionally compute the loss.

        Args:
            encoded_text: A 3-dimensional tensor; its first two dimensions are read as
                ``(batch_size, sequence_length)``.
            mask: Per-token weights passed to the loss function.
            gold_tags: Gold tag indices, or ``None`` to skip the loss.

        Returns:
            A dict with ``logits`` and ``class_probabilities``, plus ``loss`` when ``gold_tags`` is given.
        """
        hidden = encoded_text

        batch_size, sequence_length, _ = hidden.size()
        output_dim = [batch_size, sequence_length, self.num_classes]

        loss_fn = self._adaptive_loss if self.adaptive else self._loss

        output_dict = loss_fn(hidden, mask, gold_tags, output_dim)

        return output_dict

    def _adaptive_loss(self, hidden, mask, gold_tags, output_dim):
        """Adaptive-softmax branch of :meth:`forward`.

        Here ``logits`` is the untouched ``hidden`` input and ``class_probabilities`` holds the output of
        ``AdaptiveLogSoftmaxWithLoss.log_prob`` reshaped to ``output_dim``.
        """
        logits = hidden
        # ``log_prob`` is fed a 2-dimensional input, so batch and sequence dimensions are flattened first.
        reshaped_log_probs = logits.reshape(-1, logits.size(2))

        class_probabilities = self.task_output.log_prob(reshaped_log_probs).view(output_dim)

        output_dict = {"logits": logits, "class_probabilities": class_probabilities}

        if gold_tags is not None:
            output_dict["loss"] = sequence_cross_entropy(class_probabilities,
                                                         gold_tags,
                                                         mask,
                                                         label_smoothing=self.label_smoothing)

        return output_dict

    def _loss(self, hidden, mask, gold_tags, output_dim):
        """Linear-projection branch of :meth:`forward`.

        ``logits`` is the projection of ``hidden`` and ``class_probabilities`` its softmax over the last
        dimension.
        """
        logits = self.task_output(hidden)
        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(output_dim)

        output_dict = {"logits": logits, "class_probabilities": class_probabilities}

        if gold_tags is not None:
            output_dict["loss"] = sequence_cross_entropy_with_logits(logits,
                                                                     gold_tags,
                                                                     mask,
                                                                     label_smoothing=self.label_smoothing)
        return output_dict

    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Turn the scores stored in ``output_dict`` into tag strings (lemma strings for the ``lemmas`` task).

        NOTE(review): this method reads ``self.task`` and ``self.vocab``, neither of which is assigned
        anywhere in this class, so they must be set on the instance before calling it - TODO confirm
        whether this method is still used.

        Args:
            output_dict: Must contain ``words`` and ``class_probabilities[self.task]``.

        Returns:
            The same dict with the decoded tags stored under ``self.task``.
        """
        all_words = output_dict["words"]

        all_predictions = output_dict["class_probabilities"][self.task].cpu().data.numpy()
        if all_predictions.ndim == 3:
            predictions_list = [all_predictions[i] for i in range(all_predictions.shape[0])]
        else:
            # A 2-dimensional array is treated as a single sequence.
            predictions_list = [all_predictions]
        all_tags = []
        for predictions, words in zip(predictions_list, all_words):
            argmax_indices = numpy.argmax(predictions, axis=-1)
            tags = [self.vocab.get_token_from_index(x, namespace=self.task)
                    for x in argmax_indices]

            if self.task == "lemmas":
                # For lemmas the predicted tag is an edit rule which is applied to the word form.
                def decode_lemma(word, rule):
                    if rule == "_":
                        return "_"
                    if rule == "@@UNKNOWN@@":
                        return word
                    return apply_lemma_rule(word, rule)

                tags = [decode_lemma(word, rule) for word, rule in zip(words, tags)]

            all_tags.append(tags)
        output_dict[self.task] = all_tags

        return output_dict


================================================
FILE: hanlp/components/parsers/ud/ud_model.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-15 14:21

from typing import Dict, Any

import torch

from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDecoder
from hanlp.components.parsers.ud.tag_decoder import TagDecoder
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbeddingModule
from hanlp.layers.scalar_mix import ScalarMixWithDropout


class UniversalDependenciesModel(torch.nn.Module):
    """Encoder plus multi-task decoder: the encoder output is fed to a ``UniversalDependenciesDecoder``."""

    def __init__(self,
                 encoder: ContextualWordEmbeddingModule,
                 n_mlp_arc,
                 n_mlp_rel,
                 mlp_dropout,
                 num_rels,
                 num_lemmas,
                 num_upos,
                 num_feats,
                 mix_embedding: int = 13,
                 layer_dropout: int = 0.0):
        """Store ``encoder`` and build the decoder sized after ``encoder.get_output_dim()``.

        All remaining arguments are handed to ``UniversalDependenciesDecoder`` unchanged.
        """
        super(UniversalDependenciesModel, self).__init__()
        self.encoder = encoder
        self.decoder = UniversalDependenciesDecoder(
            hidden_size=encoder.get_output_dim(),
            n_mlp_arc=n_mlp_arc,
            n_mlp_rel=n_mlp_rel,
            mlp_dropout=mlp_dropout,
            num_rels=num_rels,
            num_lemmas=num_lemmas,
            num_upos=num_upos,
            num_feats=num_feats,
            mix_embedding=mix_embedding,
            layer_dropout=layer_dropout,
        )

    def forward(self,
                batch: Dict[str, torch.Tensor],
                mask,
                ):
        """Encode ``batch`` and return whatever the decoder produces for it.

        Args:
            batch: The batch, passed both to the encoder and (as ``batch``) to the decoder.
            mask: Passed to the decoder as ``mask``.
        """
        return self.decoder(self.encoder(batch), batch=batch, mask=mask)


class UniversalDependenciesDecoder(torch.nn.Module):
    """Multi-task decoder holding one sub-decoder per task: ``lemmas``, ``upos``, ``deps`` and ``feats``.

    Args:
        hidden_size: Input size given to every sub-decoder.
        n_mlp_arc: Passed to ``BiaffineDecoder``.
        n_mlp_rel: Passed to ``BiaffineDecoder``.
        mlp_dropout: Passed to ``BiaffineDecoder``.
        num_rels: Number of dependency relations, passed to ``BiaffineDecoder``.
        num_lemmas: Number of classes of the ``lemmas`` tagger.
        num_upos: Number of classes of the ``upos`` tagger.
        num_feats: Number of classes of the ``feats`` tagger.
        mix_embedding: When truthy, a ``ScalarMixWithDropout((1, mix_embedding), ...)`` is created per task.
        layer_dropout: Dropout passed to each ``ScalarMixWithDropout``.
    """

    def __init__(self,
                 hidden_size,
                 n_mlp_arc,
                 n_mlp_rel,
                 mlp_dropout,
                 num_rels,
                 num_lemmas,
                 num_upos,
                 num_feats,
                 mix_embedding: int = 13,
                 layer_dropout: float = 0.0,
                 ) -> None:
        super(UniversalDependenciesDecoder, self).__init__()

        # decoders
        self.decoders = torch.nn.ModuleDict({
            'lemmas': TagDecoder(hidden_size, num_lemmas, label_smoothing=0.03, adaptive=True),
            'upos': TagDecoder(hidden_size, num_upos, label_smoothing=0.03, adaptive=True),
            'deps': BiaffineDecoder(hidden_size, n_mlp_arc, n_mlp_rel, mlp_dropout, num_rels),
            'feats': TagDecoder(hidden_size, num_feats, label_smoothing=0.03, adaptive=True),
        })
        # Batch key holding the gold labels of each tagging task.
        self.gold_keys = {
            'lemmas': 'lemma_id',
            'upos': 'pos_id',
            'feats': 'feat_id',
        }

        if mix_embedding:
            # Every task gets its own scalar mix over the encoder output.
            self.scalar_mix = torch.nn.ModuleDict({
                task: ScalarMixWithDropout((1, mix_embedding),
                                           do_layer_norm=False,
                                           dropout=layer_dropout)
                for task in self.decoders
            })
        else:
            self.scalar_mix = None

    def forward(self,
                hidden,
                batch: Dict[str, torch.Tensor],
                mask) -> Dict[str, Any]:
        """Run every sub-decoder on ``hidden`` and collect their outputs.

        Args:
            hidden: Encoder output; mixed per task when scalar mixes were created.
            batch: The batch. ``arc`` and ``rel_id`` are read for the dependency loss and the keys in
                ``self.gold_keys`` for the tagging losses.
            mask: Token mask. A copy with column 0 cleared is used for the tagging tasks and for the
                dependency loss.

        Returns:
            A dict with per-task ``logits`` and ``class_probabilities``; ``loss`` (the sum of all
            available task losses) is only included when ``batch`` contains ``arc``.
        """
        mask_without_root = mask.clone()
        mask_without_root[:, 0] = False

        logits = {}
        class_probabilities = {}
        output_dict = {"logits": logits,
                       "class_probabilities": class_probabilities}
        loss = 0

        arc = batch.get('arc', None)
        # Run through each of the tasks on the shared encoder and save predictions
        for task in self.decoders:
            if self.scalar_mix:
                decoder_input = self.scalar_mix[task](hidden, mask)
            else:
                decoder_input = hidden

            if task == "deps":
                s_arc, s_rel = self.decoders[task](decoder_input, mask)
                pred_output = {'class_probabilities': {'s_arc': s_arc, 's_rel': s_rel}}
                if arc is not None:
                    # ``compute_loss`` is called unbound with ``None`` in place of the parser instance.
                    # noinspection PyTypeChecker
                    pred_output['loss'] = BiaffineDependencyParser.compute_loss(None, s_arc, s_rel, arc,
                                                                                batch['rel_id'],
                                                                                mask_without_root,
                                                                                torch.nn.functional.cross_entropy)
            else:
                pred_output = self.decoders[task](decoder_input, mask_without_root,
                                                  batch.get(self.gold_keys[task], None))
            if 'logits' in pred_output:
                logits[task] = pred_output["logits"]
            if 'class_probabilities' in pred_output:
                class_probabilities[task] = pred_output["class_probabilities"]
            if 'loss' in pred_output:
                # Keep track of the loss if we have the gold tags available
                loss += pred_output["loss"]

        if arc is not None:
            output_dict["loss"] = loss

        return output_dict

    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Call ``decode(output_dict)`` on every sub-decoder and return ``output_dict``.

        The task names are the keys of ``self.decoders``; this class defines no separate ``tasks``
        attribute.
        """
        for task in self.decoders:
            self.decoders[task].decode(output_dict)

        return output_dict


================================================
FILE: hanlp/components/parsers/ud/ud_parser.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-14 20:34
import logging
from copy import deepcopy
from typing import Union, List, Callable

import torch
from torch.utils.data import DataLoader

from hanlp_common.constant import IDX
from hanlp.common.dataset import PadSequenceDataLoader, SortingSamplerBuilder
from hanlp.common.structure import History
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import FieldLength, PunctuationMask
from hanlp.common.vocab import Vocab
from hanlp.components.classifiers.transformer_classifier import TransformerComponent
from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from hanlp_common.conll import CoNLLUWord, CoNLLSentence
from hanlp.components.parsers.ud.ud_model import UniversalDependenciesModel
from hanlp.components.parsers.ud.util import generate_lemma_rule, append_bos, sample_form_missing
from hanlp.components.parsers.ud.lemma_edit import apply_lemma_rule
from hanlp.datasets.parsing.loaders.conll_dataset import CoNLLParsingDataset
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding
from hanlp.metrics.accuracy import CategoricalAccuracy
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.metrics.parsing.attachmentscore import AttachmentScore
from hanlp.utils.time_util import CountdownTimer
from hanlp.utils.torch_util import clip_grad_norm, lengths_to_mask
from hanlp_common.util import merge_locals_kwargs, merge_dict, reorder


class UniversalDependenciesParser(TorchComponent):

    def __init__(self, **kwargs) -> None:
        """Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing) implementation
        of "75 Languages, 1 Model: Parsing Universal Dependencies Universally" (:cite:`kondratyuk-straka-2019-75`).

        Args:
            **kwargs: Predefined config, forwarded to the base class constructor.
        """
        super().__init__(**kwargs)
        # Re-assigns the attribute to itself; the only effect is the narrower type annotation.
        self.model: UniversalDependenciesModel = self.model

    def build_dataloader(self,
                         data,
                         batch_size,
                         shuffle=False,
                         device=None,
                         logger: logging.Logger = None,
                         sampler_builder=None,
                         gradient_accumulation=1,
                         transformer: ContextualWordEmbedding = None,
                         **kwargs) -> DataLoader:
        """Build a padded, length-aware dataloader over ``data``.

        Args:
            data: Passed to :meth:`build_dataset`.
            batch_size: Batch size for the default sampler and the dataloader.
            shuffle: Passed to the sampler builder and the dataloader.
            device: Device argument of the dataloader.
            logger: Used when vocabularies have to be built.
            sampler_builder: Optional builder of the batch sampler; a ``SortingSamplerBuilder`` is
                used when it is falsy.
            gradient_accumulation: Passed to the sampler builder.
            transformer: Embedding whose ``transform()`` is appended to the transform pipeline.
            **kwargs: Ignored.

        Returns:
            A ``PadSequenceDataLoader`` padding ``arc`` with 0.
        """
        pipeline = [generate_lemma_rule, append_bos, self.vocabs, transformer.transform(), FieldLength('token')]
        if not self.config.punct:
            pipeline.append(PunctuationMask('token', 'punct_mask'))
        dataset = self.build_dataset(data, pipeline)
        if self.vocabs.mutable:
            # Vocabularies which are still mutable have not been built yet.
            # noinspection PyTypeChecker
            self.build_vocabs(dataset, logger)
        lengths = [len(sample['token_input_ids']) for sample in dataset]
        builder = sampler_builder if sampler_builder else SortingSamplerBuilder(batch_size)
        batch_sampler = builder.build(lengths, shuffle, gradient_accumulation)
        return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=batch_sampler,
                                     pad={'arc': 0})

    def build_vocabs(self, trn, logger, **kwargs):
        """Create the ``pos``, ``rel``, ``lemma`` and ``feat`` vocabularies, iterate ``trn`` once, then lock them.

        Args:
            trn: Training dataset; it is only iterated here, the vocabularies are presumably filled by
                its transforms - TODO confirm.
            logger: Receives the vocabulary summary.
            **kwargs: Ignored.
        """
        for field in ('pos', 'rel', 'lemma', 'feat'):
            setattr(self.vocabs, field, Vocab(unk_token=None, pad_token=None))
        timer = CountdownTimer(len(trn))
        longest = 0
        for sample in trn:
            longest = max(longest, len(sample['token']))
            timer.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {longest})')
        for vocab in self.vocabs.values():
            vocab.set_unk_as_safe_unk()
        self.vocabs.lock()
        self.vocabs.summary(logger)

    def build_dataset(self, data, transform):
        """Wrap ``data`` in a ``CoNLLParsingDataset`` pruned by ``sample_form_missing``.

        Caching is switched on only when ``data`` is a string.
        """
        use_cache = isinstance(data, str)
        return CoNLLParsingDataset(data, transform=transform, prune=sample_form_missing, cache=use_cache)

    def build_optimizer(self, trn, **kwargs):
        """Build the optimizer by delegating to ``TransformerComponent.build_optimizer``.

        Args:
            trn: Training data, forwarded unchanged.
            **kwargs: Forwarded unchanged.

        Returns:
            Whatever ``TransformerComponent.build_optimizer`` returns; :meth:`fit_dataloader` unpacks it
            as an ``(optimizer, scheduler)`` pair.
        """
        # The method is borrowed unbound from another component and called with this instance as ``self``.
        # noinspection PyCallByClass,PyTypeChecker
        return TransformerComponent.build_optimizer(self, trn, **kwargs)

    def build_criterion(self, **kwargs):
        """Return ``None``: no criterion object is built; the loss is read from the model output instead."""
        return None

    def build_metric(self, **kwargs):
        """Return a ``MetricDict`` with accuracy for the tagging tasks and an attachment score for ``deps``."""
        per_task = dict(
            lemmas=CategoricalAccuracy(),
            upos=CategoricalAccuracy(),
            deps=AttachmentScore(),
            feats=CategoricalAccuracy(),
        )
        return MetricDict(per_task)

    def evaluate_dataloader(self,
                            data: DataLoader,
                            criterion: Callable,
                            metric: MetricDict = None,
                            output=False,
                            logger=None,
                            ratio_width=None,
                            **kwargs):
        """Evaluate the model on ``data``.

        Args:
            data: Dataloader to evaluate on; every batch must yield a ``loss``.
            criterion: Unused.
            metric: Metrics, reset first and then updated with every batch.
            output: Unused.
            logger: Receives the progress report.
            ratio_width: Forwarded to the progress timer.
            **kwargs: Ignored.

        Returns:
            A tuple of the loss averaged over batches and ``metric``.
        """
        metric.reset()
        self.model.eval()
        timer = CountdownTimer(len(data))
        running_loss = 0
        for step, batch in enumerate(data, start=1):
            out, mask = self.feed_batch(batch)
            batch_loss = out['loss']
            running_loss += batch_loss.item()
            self.decode_output(out, mask, batch)
            self.update_metrics(metric, batch, out, mask)
            progress = f'loss: {running_loss / step:.4f} {metric.cstr()}'
            timer.log(progress, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
            # Drop the references to this batch's tensors before the next iteration.
            del batch_loss, out, mask
        return running_loss / len(data), metric

    # noinspection PyMethodOverriding
    def build_model(self,
                    transformer: ContextualWordEmbedding,
                    n_mlp_arc,
                    n_mlp_rel,
                    mlp_dropout,
                    mix_embedding,
                    layer_dropout,
                    training=True,
                    **kwargs) -> torch.nn.Module:
        """Create the ``UniversalDependenciesModel`` sized after the built vocabularies.

        Args:
            transformer: Embedding whose ``module(training=...)`` becomes the encoder.
            n_mlp_arc: Passed to the model.
            n_mlp_rel: Passed to the model.
            mlp_dropout: Passed to the model.
            mix_embedding: Passed to the model; its truthiness has to match ``transformer.scalar_mix``.
            layer_dropout: Passed to the model.
            training: Passed to ``transformer.module``.
            **kwargs: Ignored.

        Returns:
            The model.
        """
        assert bool(transformer.scalar_mix) == bool(mix_embedding), 'transformer.scalar_mix has to be 1 ' \
                                                                    'when mix_embedding is non-zero.'
        encoder = transformer.module(training=training)
        vocabs = self.vocabs
        num_rels, num_lemmas = len(vocabs.rel), len(vocabs.lemma)
        num_upos, num_feats = len(vocabs.pos), len(vocabs.feat)
        # noinspection PyTypeChecker
        return UniversalDependenciesModel(encoder, n_mlp_arc, n_mlp_rel, mlp_dropout,
                                          num_rels, num_lemmas, num_upos, num_feats,
                                          mix_embedding, layer_dropout)

    def predict(self, data: Union[List[str], List[List[str]]], batch_size: int = None, **kwargs):
        """Parse one tokenized sentence or a list of them.

        Args:
            data: A list of tokens, or a list of such lists.
            batch_size: Batch size; the configured one is used when it is falsy.
            **kwargs: Overrides merged into the config for building the dataloader.

        Returns:
            ``[]`` for empty input, a single result for a flat input, otherwise the results in input order.
        """
        if not data:
            return []
        flat = self.input_is_flat(data)
        sentences = [data] if flat else data
        samples = self.build_samples(sentences)
        effective_batch_size = batch_size or self.config.batch_size
        dataloader = self.build_dataloader(samples,
                                           device=self.devices[0], shuffle=False,
                                           **merge_dict(self.config,
                                                        batch_size=effective_batch_size,
                                                        overwrite=True,
                                                        **kwargs))
        predictions, indices = [], []
        for batch in dataloader:
            out, mask = self.feed_batch(batch)
            self.decode_output(out, mask, batch)
            predictions.extend(self.prediction_to_human(out, batch))
            indices.extend(batch[IDX])
        # Batches do not follow the input order, so restore it from the recorded sample indices.
        predictions = reorder(predictions, indices)
        return predictions[0] if flat else predictions

    def build_samples(self, data: List[List[str]]):
        """Wrap every token list of ``data`` in a sample dict under the ``FORM`` key."""
        return [dict(FORM=tokens) for tokens in data]

    def fit(self,
            trn_data,
            dev_data,
            save_dir,
            transformer: ContextualWordEmbedding,
            sampler_builder=None,
            mix_embedding: int = 13,
            layer_dropout: float = 0.1,
            n_mlp_arc=768,
            n_mlp_rel=256,
            mlp_dropout=.33,
            lr=1e-3,
            transformer_lr=2.5e-5,
            patience=0.1,
            batch_size=32,
            epochs=30,
            gradient_accumulation=1,
            adam_epsilon=1e-8,
            weight_decay=0,
            warmup_steps=0.1,
            grad_norm=1.0,
            tree=False,
            proj=False,
            punct=False,
            logger=None,
            verbose=True,
            devices: Union[float, int, List[int]] = None, **kwargs):
        """Train the parser by handing every argument to the base class ``fit``.

        Only the arguments whose use is visible in this class are described; the remaining ones are
        consumed by the base class and the components it builds.

        Args:
            transformer: Contextual embedding; its transform and module are used by
                :meth:`build_dataloader` and :meth:`build_model`.
            sampler_builder: Optional batch sampler builder used by :meth:`build_dataloader`.
            mix_embedding: Passed to the model by :meth:`build_model`.
            layer_dropout: Passed to the model by :meth:`build_model`.
            n_mlp_arc: Passed to the model by :meth:`build_model`.
            n_mlp_rel: Passed to the model by :meth:`build_model`.
            mlp_dropout: Passed to the model by :meth:`build_model`.
            patience: Early-stopping patience; a float is turned into ``int(patience * epochs)`` by
                :meth:`execute_training_loop`.
            gradient_accumulation: Divides the loss and controls the stepping in :meth:`fit_dataloader`.
            grad_norm: Gradient clipping value used by :meth:`_step`.
            punct: When falsy, :meth:`build_dataloader` adds a ``PunctuationMask`` transform.
            **kwargs: Merged with the named arguments.

        Returns:
            Whatever the base class ``fit`` returns.
        """
        # ``locals()`` captures every argument above, so no extra local variable may be created before
        # this call.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, patience=0.5, eval_trn=True, **kwargs):
        """Train for up to ``epochs`` epochs with early stopping on the dev metric.

        Weights are saved whenever the dev metric improves. Training stops once the number of epochs
        since the best one reaches ``patience``; a float ``patience`` is a fraction of ``epochs``.
        Afterwards the best weights are reloaded unless the last epoch was the best one.
        """
        if isinstance(patience, float):
            patience = int(patience * epochs)
        best_epoch = 0
        best_metric = -1
        timer = CountdownTimer(epochs)
        history = History()
        for epoch in range(1, epochs + 1):
            logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
            self.fit_dataloader(trn, criterion, optimizer, metric, logger, history=history, ratio_width=ratio_width,
                                eval_trn=eval_trn, **self.config)
            dev_loss, dev_metric = self.evaluate_dataloader(dev, criterion, metric, logger=logger,
                                                            ratio_width=ratio_width)
            timer.update()
            report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
            improved = dev_metric > best_metric
            if improved:
                best_epoch = epoch
                # The metric object is reused across epochs, so keep a snapshot of the best one.
                best_metric = deepcopy(dev_metric)
                self.save_weights(save_dir)
                report += ' [red](saved)[/red]'
            epochs_since_best = epoch - best_epoch
            out_of_patience = epochs_since_best >= patience
            if not improved:
                report += f' ({epochs_since_best})'
                if out_of_patience:
                    report += ' early stop'
            logger.info(report)
            if out_of_patience:
                break
        if not best_epoch:
            # No epoch ever improved on the initial score: keep the current weights.
            self.save_weights(save_dir)
        elif best_epoch != epoch:
            self.load_weights(save_dir)
        logger.info(f"Max score of dev is {best_metric.cstr()} at epoch {best_epoch}")
        logger.info(f"Average time of each epoch is {timer.elapsed_average_human}")
        logger.info(f"{timer.elapsed_human} elapsed")

    # noinspection PyMethodOverriding
    def fit_dataloader(self,
                       trn: DataLoader,
                       criterion,
                       optimizer,
                       metric: MetricDict,
                       logger: logging.Logger,
                       history: History,
                       gradient_accumulation=1,
                       grad_norm=None,
                       ratio_width=None,
                       eval_trn=True,
                       **kwargs):
        """Run one training epoch over ``trn``.

        Args:
            trn: Training dataloader.
            criterion: Unused.
            optimizer: An ``(optimizer, scheduler)`` pair.
            metric: Metrics, reset first and updated per batch when ``eval_trn`` is set.
            logger: Receives the progress report.
            history: Decides via ``history.step`` when an optimizer step is due.
            gradient_accumulation: The loss is divided by it when it is larger than 1.
            grad_norm: Passed to :meth:`_step`.
            ratio_width: Forwarded to the progress timer.
            eval_trn: Whether to decode and score the training batches.
            **kwargs: Ignored.
        """
        opt, scheduler = optimizer
        metric.reset()
        self.model.train()
        timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
        running_loss = 0
        for seen, batch in enumerate(trn, start=1):
            out, mask = self.feed_batch(batch)
            loss = out['loss']
            if gradient_accumulation and gradient_accumulation > 1:
                loss /= gradient_accumulation
            loss.backward()
            running_loss += loss.item()
            if eval_trn:
                self.decode_output(out, mask, batch)
                self.update_metrics(metric, batch, out, mask)
            if history.step(gradient_accumulation):
                self._step(opt, scheduler, grad_norm)
                progress = f'loss: {running_loss / seen:.4f}'
                if eval_trn:
                    progress = f'{progress} {metric.cstr()}'
                timer.log(progress, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
            # Drop the references to this batch's tensors before the next iteration.
            del loss, out, mask

    def decode_output(self, outputs, mask, batch):
        """Decode the dependency scores and store the result in ``outputs``.

        Reads ``outputs['class_probabilities']['deps']`` and writes ``arc_preds`` and ``rel_preds``.

        Returns:
            ``outputs``, updated in place.
        """
        dep_scores = outputs['class_probabilities']['deps']
        arc_scores = dep_scores['s_arc']
        rel_scores = dep_scores['s_rel']
        decoded = BiaffineDependencyParser.decode(self, arc_scores, rel_scores, mask, batch)
        arc_preds, rel_preds = decoded
        outputs['arc_preds'] = arc_preds
        outputs['rel_preds'] = rel_preds
        return outputs

    def update_metrics(self, metrics, batch, outputs, mask):
        """Update the dependency metric and the three tagging metrics with one batch.

        Args:
            metrics: Mapping from task name to metric.
            batch: Holds the gold ``arc``, ``rel_id``, ``lemma_id``, ``pos_id`` and ``feat_id`` and
                optionally ``punct_mask``.
            outputs: Model outputs already processed by :meth:`decode_output`.
            mask: Token mask.

        Returns:
            ``metrics``.
        """
        arc_preds = outputs['arc_preds']
        rel_preds = outputs['rel_preds']
        puncts = batch.get('punct_mask', None)
        BiaffineDependencyParser.update_metric(self, arc_preds, rel_preds, batch['arc'], batch['rel_id'], mask, puncts,
                                               metrics['deps'], batch)
        gold_keys = (('lemmas', 'lemma_id'), ('upos', 'pos_id'), ('feats', 'feat_id'))
        for task, gold_key in gold_keys:
            task_metric: Metric = metrics[task]
            scores = outputs['class_probabilities'][task]
            gold = batch[gold_key]
            task_metric(scores.detach(), gold, mask=mask)
        return metrics

    def feed_batch(self, batch: dict):
        """Run the model on ``batch``.

        Returns:
            A tuple of the model output and the token mask with its first column set to 0. In training
            mode that mask is a copy, otherwise the mask given to the model is modified in place.
        """
        mask = self.compute_mask(batch)
        output_dict = self.model(batch, mask)
        root_free_mask = mask.clone() if self.model.training else mask
        root_free_mask[:, 0] = 0
        return output_dict, root_free_mask

    def compute_mask(self, batch):
        """Turn the ``token_length`` field of ``batch`` into a mask via ``lengths_to_mask``."""
        return lengths_to_mask(batch['token_length'])

    def _step(self, optimizer, scheduler, grad_norm):
        """Perform one parameter update.

        Args:
            optimizer: Optimizer to step.
            scheduler: Scheduler to step after the optimizer.
            grad_norm: Passed to ``clip_grad_norm`` together with the model.
        """
        # Clip before stepping, and clear the gradients only after both steps are done.
        clip_grad_norm(self.model, grad_norm)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    def input_is_flat(self, data):
        """Tell whether ``data`` is a single sentence by delegating to ``BiaffineDependencyParser.input_is_flat``.

        NOTE(review): the meaning of the trailing ``False`` argument is defined by that method and is
        not visible here - TODO confirm.
        """
        # The method is borrowed unbound from another component and called with this instance as ``self``.
        # noinspection PyCallByClass,PyTypeChecker
        return BiaffineDependencyParser.input_is_flat(self, data, False)

    def prediction_to_human(self, outputs: dict, batch):
        """Yield one ``CoNLLSentence`` per sample of ``batch``.

        Args:
            outputs: Model outputs already processed by :meth:`decode_output`.
            batch: The batch; its ``token`` field holds the forms, whose first element is dropped.

        Yields:
            Sentences of ``CoNLLUWord`` with 1-based ids, lemma, upos, feats, head and deprel.
        """
        arcs, rels = outputs['arc_preds'], outputs['rel_preds']
        scores = outputs['class_probabilities']

        def best_ids(task):
            # Drop the first position, then take the highest scoring class per token.
            return scores[task][:, 1:, :].argmax(-1).tolist()

        upos = best_ids('upos')
        feats = best_ids('feats')
        lemmas = best_ids('lemmas')
        lem_vocab = self.vocabs['lemma'].idx_to_token
        pos_vocab = self.vocabs['pos'].idx_to_token
        feat_vocab = self.vocabs['feat'].idx_to_token
        # noinspection PyCallByClass,PyTypeChecker
        trees = BiaffineDependencyParser.prediction_to_head_rel(self, arcs, rels, batch)
        for tree, tokens, rule_ids, pos_ids, feat_ids in zip(trees, batch['token'], lemmas, upos, feats):
            tokens = tokens[1:]
            assert len(tokens) == len(tree)
            words = []
            # ``zip`` stops at the sentence length, so ids of padded positions are never looked up.
            for index, (form, (head, deprel), rule_id, pos_id, feat_id) in enumerate(
                    zip(tokens, tree, rule_ids, pos_ids, feat_ids), start=1):
                words.append(CoNLLUWord(id=index,
                                        form=form,
                                        lemma=apply_lemma_rule(form, lem_vocab[rule_id]),
                                        upos=pos_vocab[pos_id],
                                        feats=feat_vocab[feat_id],
                                        head=head,
                                        deprel=deprel))
            yield CoNLLSentence(words)

    def __call__(self, data, batch_size=None, **kwargs) -> Union[CoNLLSentence, List[CoNLLSentence]]:
        """Call the base class ``__call__`` with the same arguments.

        The override changes no behaviour; it only narrows the declared return type.
        """
        return super().__call__(data, batch_size, **kwargs)


================================================
FILE: hanlp/components/parsers/ud/udify_util.py
================================================
# This file is modified from udify and allennlp, which are licensed under the MIT license:
# MIT License
#
# Copyright (c) 2019 Dan Kondratyuk and allennlp
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
from typing import List, Dict, Tuple, Union

import numpy
import torch


def get_ud_treebank_files(dataset_dir: str, treebanks: List[str] = None) -> Dict[str, Tuple[str, str, str]]:
    """Retrieves the train/dev/test ``.conllu`` paths of every treebank in the given directory.
    Adopted from https://github.com/Hyperparticle/udify
    MIT Licence

    Args:
      dataset_dir: Directory whose sub-directories are treebanks.
      treebanks: Names of the treebank sub-directories to look into. When empty or ``None``, every
        sub-directory of ``dataset_dir`` is used.

    Returns:
      A dict mapping each treebank name to a ``(train, dev, test)`` tuple of file paths. A split with no
      matching file is ``None``; when several files match, the alphabetically first one is taken.

    """
    if not treebanks:
        # Plain files lying next to the treebank folders (README, .DS_Store, ...) are not treebanks and
        # could not be listed below, so only directories are discovered.
        treebanks = [name for name in os.listdir(dataset_dir) if os.path.isdir(os.path.join(dataset_dir, name))]

    datasets = {}
    for treebank in treebanks:
        treebank_path = os.path.join(dataset_dir, treebank)
        conllu_files = [file for file in sorted(os.listdir(treebank_path)) if file.endswith(".conllu")]

        splits = []
        for suffix in ("train.conllu", "dev.conllu", "test.conllu"):
            matches = [file for file in conllu_files if file.endswith(suffix)]
            splits.append(os.path.join(treebank_path, matches[0]) if matches else None)

        datasets[treebank] = tuple(splits)
    return datasets


def sequence_cross_entropy(log_probs: torch.FloatTensor,
                           targets: torch.LongTensor,
                           weights: torch.FloatTensor,
                           average: str = "batch",
                           label_smoothing: float = None) -> torch.FloatTensor:
    """Computes the weighted negative log likelihood of a sequence from log probabilities.

    Args:
      log_probs: Log probabilities of size (batch, sequence_length, num_classes).
      targets: Indices of the true classes, of size (batch, sequence_length).
      weights: Weight of every element of the sequence, of size (batch, sequence_length).
      average: If "batch", average the per-sequence losses over the non-empty sequences. If "token",
        average over all weighted elements. If `None`, return the vector of per-sequence losses.
      label_smoothing: When larger than 0, the target distribution puts `1 - label_smoothing` on the
        true class plus `label_smoothing / num_classes` on every class.

    Returns:
      A scalar loss, or a tensor of size (batch,) when `average` is `None`.

    Raises:
      ValueError: If `average` is not one of `None`, "token" or "batch".

    """
    if average not in {None, "token", "batch"}:
        raise ValueError(f"Got average {average}, expected one of "
                         "None, 'token', or 'batch'")
    # shape : (batch * sequence_length, num_classes)
    log_probs_flat = log_probs.view(-1, log_probs.size(2))
    # shape : (batch * max_len, 1)
    targets_flat = targets.view(-1, 1).long()

    if label_smoothing is not None and label_smoothing > 0.0:
        num_classes = log_probs.size(-1)
        smoothing_value = label_smoothing / num_classes
        # Fill all the correct indices with 1 - smoothing value.
        one_hot_targets = torch.zeros_like(log_probs_flat).scatter_(-1, targets_flat, 1.0 - label_smoothing)
        smoothed_targets = one_hot_targets + smoothing_value
        negative_log_likelihood_flat = - log_probs_flat * smoothed_targets
        negative_log_likelihood_flat = negative_log_likelihood_flat.sum(-1, keepdim=True)
    else:
        # Contribution to the negative log likelihood only comes from the exact indices
        # of the targets, as the target distributions are one-hot. Here we use torch.gather
        # to extract the indices of the num_classes dimension which contribute to the loss.
        # shape : (batch * sequence_length, 1)
        negative_log_likelihood_flat = - torch.gather(log_probs_flat, dim=1, index=targets_flat)
    # shape : (batch, sequence_length)
    negative_log_likelihood = negative_log_likelihood_flat.view(*targets.size())
    # shape : (batch, sequence_length)
    negative_log_likelihood = negative_log_likelihood * weights.float()

    if average == "batch":
        # The small epsilon guards the divisions against sequences (or batches) with zero total weight.
        # shape : (batch_size,)
        per_batch_loss = negative_log_likelihood.sum(1) / (weights.sum(1).float() + 1e-13)
        num_non_empty_sequences = ((weights.sum(1) > 0).float().sum() + 1e-13)
        return per_batch_loss.sum() / num_non_empty_sequences
    elif average == "token":
        return negative_log_likelihood.sum() / (weights.sum().float() + 1e-13)
    else:
        # shape : (batch_size,)
        per_batch_loss = negative_log_likelihood.sum(1) / (weights.sum(1).float() + 1e-13)
        return per_batch_loss


def sequence_cross_entropy_with_logits(
        logits: torch.FloatTensor,
        targets: torch.LongTensor,
        weights: Union[torch.FloatTensor, torch.BoolTensor],
        average: str = "batch",
        label_smoothing: float = None,
        gamma: float = None,
        alpha: Union[float, List[float], torch.FloatTensor] = None,
) -> torch.FloatTensor:
    """Computes the cross entropy loss of a sequence, weighted with respect to
    some user provided weights. Note that the weighting here is not the same as
    in the `torch.nn.CrossEntropyLoss()` criterion, which is weighting
    classes; here we are weighting the loss contribution from particular elements
    in the sequence. This allows loss computations for models which use padding.

    Args:
        logits: A tensor of size (batch_size, sequence_length, num_classes)
            which contains the unnormalized probability for each class.
        targets: A tensor of size (batch, sequence_length) which contains the
            index of the true class for each corresponding step.
        weights: A tensor of size (batch, sequence_length). It is cast to the dtype of
            ``logits`` before use.
        average: If ``"batch"``, average the loss across the batches. If ``"token"``, average
            the loss across each item in the input. If ``None``, return a vector
            of losses per batch element. (Default value = "batch")
        label_smoothing: Whether or not to apply label smoothing to the cross-entropy loss.
            For example, with a label smoothing value of 0.2, a 4 class classification
            target would look like ``[0.05, 0.05, 0.85, 0.05]`` if the 3rd class was
            the correct label. (Default value = None)
        gamma: Focal loss[*] focusing parameter to reduce the relative loss for
            well-classified examples and put more focus on hard ones. The greater the value,
            the more focus on hard examples. A falsy value (``None`` or ``0``) disables it.
            (Default value = None)
        alpha: Focal loss[*] weighting factor to balance between classes. Can be
            used independently of ``gamma``. If a single number is provided, the binary case
            is assumed, using ``alpha`` for class ``1`` and ``1 - alpha`` for class ``0``.
            If a list, array or tensor is provided, it is indexed by the target class id.
            (Default value = None)

            [*] T. Lin, P. Goyal, R. Girshick, K. He and P. Dollár, "Focal Loss for
            Dense Object Detection," 2017 IEEE International Conference on Computer
            Vision (ICCV), Venice, 2017, pp. 2999-3007.

    Returns:
        A tensor representing the cross entropy loss.
        If ``average == "batch"`` or ``average == "token"``, the returned loss is a scalar.
        If ``average is None``, the returned loss is a vector of shape (batch_size,).

    Raises:
        ValueError: If ``average`` is not one of ``None``, ``"token"`` or ``"batch"``.
        TypeError: If ``alpha`` is neither a number, a list, a ``numpy.ndarray`` nor a tensor.

    """
    if average not in {None, "token", "batch"}:
        # The message must be a real f-string so the offending value is reported.
        raise ValueError(f"Got average {average}, expected one of None, 'token', or 'batch'")

    # make sure weights are float
    weights = weights.to(logits.dtype)
    # sum all dim except batch
    non_batch_dims = tuple(range(1, len(weights.shape)))
    # shape : (batch_size,)
    # Computed before the focal/alpha re-weighting below, so the normalisers use the raw weights.
    weights_batch_sum = weights.sum(dim=non_batch_dims)
    # shape : (batch * sequence_length, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # shape : (batch * sequence_length, num_classes)
    log_probs_flat = torch.nn.functional.log_softmax(logits_flat, dim=-1)
    # shape : (batch * max_len, 1)
    targets_flat = targets.view(-1, 1).long()
    # focal loss coefficient
    if gamma:
        # shape : (batch * sequence_length, num_classes)
        probs_flat = log_probs_flat.exp()
        # shape : (batch * sequence_length, 1)
        probs_flat = torch.gather(probs_flat, dim=1, index=targets_flat)
        # shape : (batch * sequence_length, 1)
        focal_factor = (1.0 - probs_flat) ** gamma
        # shape : (batch, sequence_length)
        focal_factor = focal_factor.view(*targets.size())
        weights = weights * focal_factor

    if alpha is not None:
        # shape : () / (num_classes,)
        if isinstance(alpha, (float, int)):

            # shape : (2,)
            alpha_factor = torch.tensor(
                [1.0 - float(alpha), float(alpha)], dtype=weights.dtype, device=weights.device
            )

        elif isinstance(alpha, (list, numpy.ndarray, torch.Tensor)):

            # shape : (c,)
            alpha_factor = torch.tensor(alpha, dtype=weights.dtype, device=weights.device)

            # An empty size means a 0-dim (scalar) value was given: treat it as the binary case.
            if not alpha_factor.size():
                # shape : (1,)
                alpha_factor = alpha_factor.view(1)
                # shape : (2,)
                alpha_factor = torch.cat([1 - alpha_factor, alpha_factor])
        else:
            raise TypeError(
                ("alpha must be float, list of float, or torch.FloatTensor, {} provided.").format(
                    type(alpha)
                )
            )
        # Look up the per-class factor for every target id.
        # shape : (batch, max_len)
        alpha_factor = torch.gather(alpha_factor, dim=0, index=targets_flat.view(-1)).view(
            *targets.size()
        )
        weights = weights * alpha_factor

    if label_smoothing is not None and label_smoothing > 0.0:
        num_classes = logits.size(-1)
        smoothing_value = label_smoothing / num_classes
        # Fill all the correct indices with 1 - smoothing value.
        one_hot_targets = torch.zeros_like(log_probs_flat).scatter_(
            -1, targets_flat, 1.0 - label_smoothing
        )
        smoothed_targets = one_hot_targets + smoothing_value
        negative_log_likelihood_flat = -log_probs_flat * smoothed_targets
        negative_log_likelihood_flat = negative_log_likelihood_flat.sum(-1, keepdim=True)
    else:
        # Contribution to the negative log likelihood only comes from the exact indices
        # of the targets, as the target distributions are one-hot. Here we use torch.gather
        # to extract the indices of the num_classes dimension which contribute to the loss.
        # shape : (batch * sequence_length, 1)
        negative_log_likelihood_flat = -torch.gather(log_probs_flat, dim=1, index=targets_flat)
    # shape : (batch, sequence_length)
    negative_log_likelihood = negative_log_likelihood_flat.view(*targets.size())
    # shape : (batch, sequence_length)
    negative_log_likelihood = negative_log_likelihood * weights

    if average == "batch":
        # shape : (batch_size,)
        per_batch_loss = negative_log_likelihood.sum(non_batch_dims) / (
                weights_batch_sum + tiny_value_of_dtype(negative_log_likelihood.dtype)
        )
        # Only sequences with a positive total weight count towards the batch average.
        num_non_empty_sequences = (weights_batch_sum > 0).sum() + tiny_value_of_dtype(
            negative_log_likelihood.dtype
        )
        return per_batch_loss.sum() / num_non_empty_sequences
    elif average == "token":
        return negative_log_likelihood.sum() / (
                weights_batch_sum.sum() + tiny_value_of_dtype(negative_log_likelihood.dtype)
        )
    else:
        # shape : (batch_size,)
        per_batch_loss = negative_log_likelihood.sum(non_batch_dims) / (
                weights_batch_sum + tiny_value_of_dtype(negative_log_likelihood.dtype)
        )
        return per_batch_loss


def tiny_value_of_dtype(dtype: torch.dtype):
    """Returns a moderately tiny value for a given PyTorch data type that is used to avoid numerical
    issues such as division by zero.
    This is different from `info_value_of_dtype(dtype).tiny` because it causes some NaN bugs.
    Only supports floating point dtypes.

    Args:
      dtype: torch.dtype: The dtype to pick an epsilon for.

    Returns:
      ``1e-13`` for ``torch.float`` / ``torch.double`` and ``1e-4`` for ``torch.half``.

    Raises:
      TypeError: If ``dtype`` is not floating point, or is a floating point dtype other than
        the three listed above.

    """
    if not dtype.is_floating_point:
        raise TypeError("Only supports floating point dtypes.")
    # Single and double precision share the same epsilon.
    if dtype in (torch.float, torch.double):
        return 1e-13
    if dtype == torch.half:
        return 1e-4
    raise TypeError("Does not support dtype " + str(dtype))


def combine_initial_dims_to_1d_or_2d(tensor: torch.Tensor) -> torch.Tensor:
    """Given a (possibly higher order) tensor of ids with shape
    (d1, ..., dn, sequence_length), collapse all leading dimensions into one.

    Args:
      tensor: torch.Tensor: The tensor to flatten.

    Returns:
      A view of shape (d1 * ... * dn, sequence_length).
      If original tensor is 1-d or 2-d, return it as is.

    """
    if tensor.dim() > 2:
        # Keep the last dimension and merge everything in front of it.
        return tensor.view(-1, tensor.size(-1))
    return tensor


def uncombine_initial_dims(tensor: torch.Tensor, original_size: torch.Size) -> torch.Tensor:
    """Given a tensor of embeddings with shape
    (d1 * ... * dn, sequence_length, embedding_dim)
    and the original shape
    (d1, ..., dn, sequence_length),
    restore the leading dimensions.

    Args:
      tensor: torch.Tensor: The flattened tensor.
      original_size: torch.Size: The size before the leading dimensions were combined.

    Returns:
      (d1, ..., dn, sequence_length, embedding_dim).
      If original size is 1-d or 2-d, return it as is.

    """
    if len(original_size) > 2:
        # The restored shape is the original one with the last dim of ``tensor`` appended.
        restored_shape = list(original_size)
        restored_shape.append(tensor.size(-1))
        return tensor.view(*restored_shape)
    return tensor


def get_range_vector(size: int, device: int) -> torch.Tensor:
    """Returns a range vector with the desired size, starting at 0. The CUDA implementation
    is meant to avoid copy data from CPU to GPU.

    Args:
      size: int: Number of elements in the range.
      device: int: CUDA device index; any value not greater than ``-1`` selects the
        ``torch.arange`` branch.

    Returns:
      A long tensor holding ``0, 1, ..., size - 1``.

    """
    if device <= -1:
        return torch.arange(0, size, dtype=torch.long)
    # Build the range on the GPU: a vector of ones, cumulatively summed, shifted to start at 0.
    ones = torch.cuda.LongTensor(size, device=device).fill_(1)
    return ones.cumsum(0) - 1


def get_device_of(tensor: torch.Tensor) -> int:
    """Returns the device of the tensor.

    Args:
      tensor: torch.Tensor: The tensor to inspect.

    Returns:
      ``-1`` when the tensor is not a CUDA tensor, otherwise ``tensor.get_device()``.

    """
    return tensor.get_device() if tensor.is_cuda else -1


================================================
FILE: hanlp/components/parsers/ud/util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-14 20:44
from hanlp_common.constant import ROOT
from hanlp.components.parsers.ud.lemma_edit import gen_lemma_rule


def generate_lemma_rule(sample: dict):
    """Add a ``lemma`` field holding one lemma rule per token.

    Args:
        sample: A sample dict. When it has a ``LEMMA`` field it must also have ``FORM``.

    Returns:
        The same ``sample`` object. If ``LEMMA`` is present, ``sample['lemma']`` is set to a list
        in which ``"_"`` lemmas stay ``"_"`` and every other lemma is replaced by
        ``gen_lemma_rule(form, lemma)``.
    """
    if 'LEMMA' not in sample:
        return sample
    rules = []
    for form, lemma in zip(sample['FORM'], sample['LEMMA']):
        if lemma != "_":
            rules.append(gen_lemma_rule(form, lemma))
        else:
            # Underscore marks a missing lemma; pass it through untouched.
            rules.append("_")
    sample['lemma'] = rules
    return sample


def append_bos(sample: dict):
    """Prepend a beginning-of-sentence entry to the fields of a sample.

    Args:
        sample: A sample dict. When ``UPOS`` is present, ``HEAD``, ``DEPREL``, ``lemma`` and
            ``FEATS`` are read as well.

    Returns:
        The same ``sample`` object. ``token`` is ``FORM`` with ``ROOT`` in front; ``pos``, ``rel``,
        ``lemma`` and ``feat`` repeat their first element in front; ``arc`` gets a leading ``0``.
    """
    if 'FORM' in sample:
        sample['token'] = [ROOT] + sample['FORM']
    if 'UPOS' not in sample:
        return sample
    upos = sample['UPOS']
    # The first real value is duplicated to fill the slot of the artificial root token.
    sample['pos'] = upos[:1] + upos
    sample['arc'] = [0] + sample['HEAD']
    deprel = sample['DEPREL']
    sample['rel'] = deprel[:1] + deprel
    lemma = sample['lemma']
    sample['lemma'] = lemma[:1] + lemma
    feats = sample['FEATS']
    sample['feat'] = feats[:1] + feats
    return sample


def sample_form_missing(sample: dict):
    """Tell whether every entry of ``sample['FORM']`` is the placeholder ``'_'``.

    Returns:
        ``True`` if all forms equal ``'_'`` (also for an empty ``FORM``), ``False`` otherwise.
    """
    for form in sample['FORM']:
        if not form == '_':
            return False
    return True


================================================
FILE: hanlp/components/pipeline.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 00:22
import types
from typing import Callable, Union, Iterable, Any
from hanlp.components.lambda_wrapper import LambdaComponent
from hanlp.common.component import Component
from hanlp_common.document import Document
from hanlp.utils.component_util import load_from_meta
from hanlp_common.io import save_json, load_json
from hanlp_common.reflection import str_to_type, classpath_of
import hanlp


class Pipe(Component):
    """A single stage of a pipeline: a component plus the document keys it reads from and writes to."""

    def __init__(self, component: Component, input_key: str = None, output_key: str = None, **kwargs) -> None:
        """
        Args:
            component: The component called by this pipe. Its ``config`` is stored in this pipe's config.
            input_key: Key (or tuple/list of keys) selecting the fields of the document passed to
                ``component``. A falsy value passes the whole document.
            output_key: Key (or tuple/list of keys) under which the outputs are stored. A falsy value
                makes :meth:`predict` return the raw output of ``component``.
            **kwargs: Extra keyword arguments merged into every call of ``component``.
        """
        super().__init__()
        if not hasattr(self, 'config'):
            self.config = {'classpath': classpath_of(self)}
        self.output_key = output_key
        self.input_key = input_key
        self.component = component
        self.kwargs = kwargs
        self.config.update({
            'component': component.config,
            'input_key': self.input_key,
            'output_key': self.output_key,
            'kwargs': self.kwargs
        })

    # noinspection PyShadowingBuiltins
    def predict(self, doc: Document, **kwargs) -> Document:
        """Run the wrapped component on the selected fields of ``doc``.

        Args:
            doc: The document (or any other input when ``input_key`` is not set).
            **kwargs: Call-time keyword arguments; entries of ``self.kwargs`` override them.

        Returns:
            ``doc`` with the outputs stored under ``output_key`` when it is set (a fresh
            :class:`~hanlp_common.document.Document` is created if ``doc`` is not one), otherwise the
            output of the component.
        """

        unpack = False
        if self.input_key:
            if isinstance(self.input_key, (tuple, list)):
                if isinstance(self.component, LambdaComponent):  # assume functions take multiple arguments
                    input = [doc[key] for key in self.input_key]
                    unpack = True
                else:
                    # Zip the selected fields sentence by sentence, then transpose each sentence.
                    input = list(list(zip(*sent)) for sent in zip(*[doc[key] for key in self.input_key]))
            else:
                input = doc[self.input_key]
        else:
            input = doc

        if self.kwargs:
            kwargs.update(self.kwargs)
        if unpack:
            kwargs['_hanlp_unpack'] = True
        output = self.component(input, **kwargs)
        if isinstance(output, types.GeneratorType):
            output = list(output)
        if self.output_key:
            if not isinstance(doc, Document):
                doc = Document()
            # Lists are accepted as well as tuples: a tuple ``output_key`` comes back as a list once
            # the config has been through JSON, and ``input_key`` above already accepts both.
            if isinstance(self.output_key, (tuple, list)):
                for key, value in zip(self.output_key, output):
                    doc[key] = value
            else:
                doc[self.output_key] = output
            return doc
        return output

    def __repr__(self):
        name = self.component.function.__name__ if isinstance(self.component, LambdaComponent) \
            else self.component.__class__.__name__
        return f'{self.input_key}->{name}->{self.output_key}'

    @staticmethod
    def from_config(meta: dict, **kwargs):
        """Rebuild a pipe from its config dict.

        Args:
            meta: A dict with ``classpath``, ``component``, ``input_key``, ``output_key`` and ``kwargs``.
            **kwargs: Not used.

        Returns:
            An instance of the class named by ``meta['classpath']``.
        """
        cls = str_to_type(meta['classpath'])
        component = load_from_meta(meta['component'])
        return cls(component, meta['input_key'], meta['output_key'], **meta['kwargs'])


class Pipeline(Component, list):
    """An ordered list of :class:`Pipe` objects which are applied one after another."""

    def __init__(self, *pipes: Pipe) -> None:
        """
        Args:
            *pipes: Pipes to start the pipeline with.
        """
        super().__init__()
        if not hasattr(self, 'config'):
            self.config = {'classpath': classpath_of(self)}
        if pipes:
            self.extend(pipes)

    def append(self, component: Callable, input_key: Union[str, Iterable[str]] = None,
               output_key: Union[str, Iterable[str]] = None, **kwargs):
        """
        Add a pipe at the end of this pipeline.

        Args:
            component: A callable function.
            input_key: The input key indicating which fields will be inputted to the pipe. ``None``: inherit from
                previous pipe; ``*``: use all the outputs from previous pipes wrapped in a
                :class:`~hanlp_common.document.Document`.
            output_key: The output key indicating where to store the outputs
            **kwargs: Extra arguments passed to the ``Pipe`` constructor.

        Returns:

            Pipeline: A pipeline.
        """
        tail = len(self)
        self.insert(tail, component, input_key, output_key, **kwargs)
        return self

    def insert(self, index: int, component: Callable, input_key: Union[str, Iterable[str]] = None,
               output_key: Union[str, Iterable[str]] = None,
               **kwargs):
        """
        Put a new pipe at the given position.

        Args:
            index: The index of the new pipe.
            component: A callable; anything that is not a ``Component`` is wrapped in a ``LambdaComponent``.
            input_key: The input key indicating which fields will be inputted to the pipe. ``None``: inherit from
                previous pipe; ``*``: use all the outputs from previous pipes wrapped in a
                :class:`~hanlp_common.document.Document`.
            output_key: The output key indicating where to store the outputs
            **kwargs: Extra arguments passed to the ``Pipe`` constructor.

        Returns:

            Pipeline: A pipeline.
        """
        if input_key == '*':
            # A pipe without an input key receives the whole document.
            input_key = None
        elif not input_key and len(self) and index:
            # Feed the new pipe with whatever the pipe right before it produces.
            input_key = self[index - 1].output_key
        wrapped = component if isinstance(component, Component) else LambdaComponent(component)
        super().insert(index, Pipe(wrapped, input_key, output_key, **kwargs))
        return self

    def __call__(self, doc: Union[Document, Any] = None, **kwargs) -> Document:
        """Apply every pipe in order.

        Args:
            doc: A :class:`~hanlp_common.document.Document` or other data types.
            **kwargs: If `doc` is set to None then create a :class:`~hanlp_common.document.Document` as the
                input to the first pipe using all the parameters in ``kwargs``.

        Returns:
            A :class:`~hanlp_common.document.Document`.
        """
        if doc is None:
            doc = Document(**kwargs)
        for pipe in self:
            doc = pipe(doc)
        return doc

    def copy(self):
        """Same as :meth:`__copy__`."""
        return self.__copy__()

    def __copy__(self):
        # The copy is rebuilt from the serialisable meta rather than by duplicating the list.
        return Pipeline.from_config(self.meta)

    @property
    def meta(self):
        """The class path, HanLP version and the config of every pipe."""
        return {
            'classpath': classpath_of(self),
            'hanlp_version': hanlp.version.__version__,
            'pipes': [each.config for each in self]
        }

    @meta.setter
    def meta(self, value):
        # Assignments to ``meta`` are ignored; the value is always derived from the pipes.
        pass

    def save(self, filepath):
        """Write :attr:`meta` to ``filepath`` as JSON."""
        save_json(self.meta, filepath)

    def load(self, filepath):
        """Replace the pipes of this pipeline with the ones described in the JSON file ``filepath``."""
        meta = load_json(filepath)
        self.clear()
        self.extend(Pipeline.from_config(meta))

    @staticmethod
    def from_config(meta: Union[dict, str], **kwargs):
        """Build a pipeline from a meta dict, or from the path of a JSON file holding one.

        Args:
            meta: The meta dict, or a path passed to ``load_json``.
            **kwargs: Not used.

        Returns:
            A new :class:`Pipeline`.
        """
        config = load_json(meta) if isinstance(meta, str) else meta
        pipes = [load_from_meta(pipe_meta) for pipe_meta in config['pipes']]
        return Pipeline(*pipes)


================================================
FILE: hanlp/components/rnn_language_model_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-04 17:28
from typing import List, Union

import tensorflow as tf

from hanlp.common.keras_component import KerasComponent
from hanlp.transform.text_tf import TextTransform


class RNNLanguageModel(KerasComponent):
    """A language model made of an embedding layer, an LSTM encoder and a time-distributed dense decoder."""

    def __init__(self, transform: TextTransform = None) -> None:
        """
        Args:
            transform: The text transform; a default ``TextTransform`` is created when it is falsy.
        """
        transform = transform or TextTransform()
        super().__init__(transform)
        self.transform: TextTransform = transform

    def fit(self, trn_data, dev_data, save_dir,
            forward=True,
            embedding=100,
            rnn_input_dropout=0.1,
            rnn_units: int = 1024,
            rnn_output_dropout=0.1,
            seq_len: int = 250,
            optimizer='sgd',
            learning_rate=20,
            anneal_factor: float = 0.25,
            anneal_patience: int = 10,
            clipnorm=0.25,
            batch_size: int = 100, epochs=1000, run_eagerly=False, logger=None, verbose=True,
            **kwargs):
        # Forward the locals of this frame to the parent ``fit``, leaving out ``self`` and ``kwargs``.
        # NOTE(review): entries of ``kwargs`` are therefore not passed on - confirm this is intended.
        return super().fit(**dict((k, v) for k, v in locals().items() if k not in ('self', 'kwargs')))

    def build_model(self, embedding, rnn_input_dropout, rnn_units, rnn_output_dropout, batch_size, seq_len, training,
                    **kwargs) -> tf.keras.Model:
        """Assemble the sequential Keras model.

        Args:
            embedding: Output dimension of the embedding layer.
            rnn_input_dropout: Dropout rate before the LSTM; a falsy value skips the layer.
            rnn_units: Number of LSTM units.
            rnn_output_dropout: Dropout rate after the LSTM; a falsy value skips the layer.
            batch_size: Used for the fixed batch input shape in training mode.
            seq_len: Used for the fixed batch input shape in training mode.
            training: Makes the LSTM stateful and fixes the batch input shape.
            **kwargs: Not used.

        Returns:
            The model.
        """
        vocab_size = len(self.transform.vocab)
        # The batch shape is only pinned in training mode, where the LSTM is stateful.
        input_shape_args = {'batch_input_shape': [batch_size, seq_len]} if training else {}
        model = tf.keras.Sequential()
        embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding,
                                                    trainable=True, mask_zero=True, **input_shape_args)
        model.add(embedding_layer)
        if rnn_input_dropout:
            model.add(tf.keras.layers.Dropout(rnn_input_dropout, name='rnn_input_dropout'))
        encoder = tf.keras.layers.LSTM(units=rnn_units, return_sequences=True, stateful=training, name='encoder')
        model.add(encoder)
        if rnn_output_dropout:
            model.add(tf.keras.layers.Dropout(rnn_output_dropout, name='rnn_output_dropout'))
        decoder = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(len(self.transform.vocab)), name='decoder')
        model.add(decoder)
        return model

    # noinspection PyMethodOverriding
    def build_optimizer(self, optimizer, learning_rate, clipnorm, **kwargs):
        """Turn the name ``'sgd'`` into an SGD optimizer, then defer to the parent implementation."""
        if optimizer == 'sgd':
            optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, clipnorm=clipnorm)
        return super().build_optimizer(optimizer, **kwargs)

    def build_train_dataset(self, trn_data, batch_size):
        """Build the training dataset: not shuffled, with ``repeat=-1``."""
        return self.transform.file_to_dataset(trn_data, batch_size=batch_size, shuffle=False, repeat=-1)

    def build_valid_dataset(self, dev_data, batch_size):
        """Build the validation dataset: not shuffled, remainder dropped."""
        return self.transform.file_to_dataset(dev_data, batch_size=batch_size, shuffle=False, drop_remainder=True)

    def generate_text(self, text: Union[str, List[str]] = '\n', num_steps=50):
        """Extend ``text`` by repeatedly calling :meth:`predict`.

        Args:
            text: The seed, either a string (handled character by character) or a list of tokens.
            num_steps: Number of prediction steps.

        Returns:
            A string if a string was given, otherwise the list of tokens.
        """
        char_mode = isinstance(text, str)
        if char_mode:
            text = list(text)
        forward = self.config['forward']
        # Deliberately simple and slow: the whole text is fed back in on every step instead of
        # reusing the LSTM states.
        for _ in range(num_steps):
            predicted = self.predict(text)[-1]
            if forward:
                text += predicted
            else:
                text = [predicted] + text
        return ''.join(text) if char_mode else text


================================================
FILE: hanlp/components/srl/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-22 20:50

================================================
FILE: hanlp/components/srl/span_bio/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-04 13:59


================================================
FILE: hanlp/components/srl/span_bio/baffine_tagging.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-04 13:59
import math

import torch
from torch import nn

from hanlp.components.parsers.biaffine.biaffine import Biaffine
from hanlp.components.parsers.biaffine.mlp import MLP
from hanlp.layers.crf.crf import CRF


class BiaffineTaggingDecoder(nn.Module):
    """Scores relations with two MLPs followed by a biaffine layer, optionally owning a CRF."""

    def __init__(self,
                 n_rels,
                 hidden_size,
                 n_mlp_rel=300,
                 mlp_dropout=0.2,
                 crf=False) -> None:
        """
        Args:
            n_rels: Output size of the biaffine layer and number of tags of the CRF.
            hidden_size: Input size of both MLPs.
            n_mlp_rel: Output size of both MLPs.
            mlp_dropout: Dropout passed to both MLPs.
            crf: Whether to create a ``CRF``; ``self.crf`` is ``None`` otherwise.
        """
        super().__init__()
        mlp_config = dict(n_in=hidden_size, n_out=n_mlp_rel, dropout=mlp_dropout)
        self.mlp_rel_h = MLP(**mlp_config)
        self.mlp_rel_d = MLP(**mlp_config)
        self.rel_attn = Biaffine(n_in=n_mlp_rel, n_out=n_rels, bias_x=True, bias_y=True)
        # Re-initialise the biaffine weight uniformly within +/- 1/sqrt(size of its dim 1).
        bound = 1 / math.sqrt(self.rel_attn.weight.size(1))
        nn.init.uniform_(self.rel_attn.weight, -bound, bound)
        if crf:
            self.crf = CRF(n_rels)
        else:
            self.crf = None

    # noinspection PyUnusedLocal
    def forward(self, x: torch.Tensor, **kwargs):
        """Compute relation scores.

        Args:
            x: The hidden states fed to both MLPs.
            **kwargs: Not used.

        Returns:
            The biaffine output with its dimension 1 moved to the end; per the original comment
            this is [batch_size, seq_len, seq_len, n_rels].
        """
        head = self.mlp_rel_h(x)
        dependent = self.mlp_rel_d(x)
        return self.rel_attn(dependent, head).permute(0, 2, 3, 1)


class SpanBIOSemanticRoleLabelingModel(nn.Module):
    """Embedding, optional encoder and a :class:`BiaffineTaggingDecoder` chained together."""

    def __init__(self,
                 embed,
                 encoder,
                 num_labels: int,
                 n_mlp_rel,
                 mlp_dropout,
                 crf=False,
                 ) -> None:
        """
        Args:
            embed: Embedding module; must offer ``get_output_dim()`` when no encoder is used.
            encoder: Encoder module offering ``get_output_dim()``, or a falsy value for none.
            num_labels: Number of labels handed to the decoder.
            n_mlp_rel: MLP size handed to the decoder.
            mlp_dropout: MLP dropout handed to the decoder.
            crf: Whether the decoder creates a CRF.
        """
        super().__init__()
        self.embed = embed
        self.encoder = encoder
        # The decoder consumes the output of the encoder if there is one, else of the embedding.
        feature_source = encoder if encoder else embed
        hidden_size = feature_source.get_output_dim()
        self.decoder = BiaffineTaggingDecoder(
            n_rels=num_labels,
            hidden_size=hidden_size,
            n_mlp_rel=n_mlp_rel,
            mlp_dropout=mlp_dropout,
            crf=crf,
        )

    def forward(self, batch, mask):
        """Embed ``batch``, encode it when an encoder is set (passing ``mask``), then decode."""
        hidden = self.embed(batch)
        if self.encoder:
            hidden = self.encoder(hidden, mask=mask)
        return self.decoder(hidden)


================================================
FILE: hanlp/components/srl/span_bio/span_bio.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-22 20:54
import logging
from copy import copy
from typing import Union, List, Callable, Dict, Any
from bisect import bisect
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader

from hanlp_common.constant import IDX, PRED
from hanlp.common.dataset import PadSequenceDataLoader, SamplerBuilder, TransformableDataset
from hanlp.common.structure import History
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import FieldLength
from hanlp.common.vocab import Vocab
from hanlp.components.srl.span_bio.baffine_tagging import SpanBIOSemanticRoleLabelingModel
from hanlp.datasets.srl.loaders.conll2012 import CoNLL2012SRLBIODataset
from hanlp.layers.crf.crf import CRF
from hanlp.layers.embeddings.contextual_word_embedding import find_transformer
from hanlp.layers.embeddings.embedding import Embedding
from hanlp.layers.transformers.utils import build_optimizer_scheduler_with_transformer
from hanlp.metrics.chunking.sequence_labeling import get_entities
from hanlp.metrics.f1 import F1
from hanlp.utils.string_util import guess_delimiter
from hanlp.utils.time_util import CountdownTimer
from hanlp.utils.torch_util import clip_grad_norm, lengths_to_mask
from hanlp_common.util import merge_locals_kwargs, reorder


class SpanBIOSemanticRoleLabeler(TorchComponent):

    def __init__(self, **kwargs) -> None:
        """A span based Semantic Role Labeling task using the BIO scheme for tagging the role of each token.

        Given a predicate and a token, it uses biaffine (:cite:`dozat:17a`) to predict their relation as one of
        the BIO-ROLE tags.

        Args:
            **kwargs: Predefined config, forwarded to the parent constructor.
        """
        super().__init__(**kwargs)
        # No model exists yet at construction time; ``build_model`` is the method that creates one.
        self.model: SpanBIOSemanticRoleLabelingModel = None

    def build_optimizer(self,
                        trn,
                        epochs,
                        lr,
                        adam_epsilon,
                        weight_decay,
                        warmup_steps,
                        transformer_lr=None,
                        gradient_accumulation=1,
                        **kwargs):
        """Create the optimizer and learning-rate scheduler for the current model.

        Args:
            trn: Training dataloader; only its length (number of batches) is used.
            epochs: Number of training epochs.
            lr: Learning rate passed to the optimizer builder.
            adam_epsilon: Epsilon passed to the optimizer builder.
            weight_decay: Weight decay passed to the optimizer builder.
            warmup_steps: Warm-up setting passed to the scheduler builder.
            transformer_lr: Learning rate for the transformer found in the embedding; falls back to ``lr``.
            gradient_accumulation: Number of batches accumulated per optimizer update.
            **kwargs: Ignored.

        Returns:
            An ``(optimizer, scheduler)`` tuple.
        """
        # One optimizer update happens every ``gradient_accumulation`` batches.
        total_updates = len(trn) * epochs // gradient_accumulation
        effective_transformer_lr = lr if transformer_lr is None else transformer_lr
        optimizer, scheduler = build_optimizer_scheduler_with_transformer(
            self.model,
            find_transformer(self.model.embed),
            lr,
            effective_transformer_lr,
            total_updates,
            warmup_steps,
            weight_decay,
            adam_epsilon,
        )
        return optimizer, scheduler

    def build_criterion(self, decoder=None, **kwargs):
        """Return the training criterion selected by ``self.config.crf``.

        Args:
            decoder: Decoder owning the CRF layer; defaults to ``self.model.decoder``. A ``DataParallel``
                wrapper is unwrapped first.
            **kwargs: Ignored.

        Returns:
            The decoder's ``crf`` attribute when CRF is enabled, otherwise a ``CrossEntropyLoss`` using
            ``self.config.loss_reduction``.
        """
        if not self.config.crf:
            return nn.CrossEntropyLoss(reduction=self.config.loss_reduction)
        owner = decoder if decoder else self.model.decoder
        if isinstance(owner, torch.nn.DataParallel):
            owner = owner.module
        return owner.crf

    def build_metric(self, **kwargs):
        """Create a fresh ``F1`` metric; ``kwargs`` are ignored."""
        return F1()

    def execute_training_loop(self,
                              trn: DataLoader,
                              dev: DataLoader,
                              epochs,
                              criterion,
                              optimizer,
                              metric,
                              save_dir,
                              logger: logging.Logger,
                              devices,
                              ratio_width=None,
                              patience=0.5,
                              **kwargs):
        """Train for up to ``epochs`` epochs with early stopping on the dev metric.

        After every epoch the dev set is evaluated; weights are saved whenever the dev metric improves, and
        training stops once ``patience`` epochs have passed since the best epoch.

        Args:
            trn: Training dataloader.
            dev: Development dataloader.
            epochs: Maximum number of epochs.
            criterion: Loss passed to the fit/evaluate routines.
            optimizer: Passed through to ``fit_dataloader``.
            metric: Metric object passed to the fit/evaluate routines.
            save_dir: Directory used to save and reload weights.
            logger: Logger receiving progress reports.
            devices: Unused here.
            ratio_width: Forwarded to the progress reporting.
            patience: Epochs to wait without improvement; a float is interpreted as a fraction of ``epochs``.
            **kwargs: Ignored.
        """
        if isinstance(patience, float):
            patience = int(patience * epochs)
        timer = CountdownTimer(epochs)
        history = History()
        best_epoch = 0
        best_metric = -1
        for epoch in range(1, epochs + 1):
            logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
            self.fit_dataloader(trn, criterion, optimizer, metric, logger, history=history, ratio_width=ratio_width,
                                **self.config)
            loss, dev_metric = self.evaluate_dataloader(dev, criterion, metric, logger=logger, ratio_width=ratio_width)
            timer.update()
            report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
            improved = dev_metric > best_metric
            if improved:
                # Copy the metric because the same object keeps being updated by later evaluations.
                best_epoch, best_metric = epoch, copy(dev_metric)
                self.save_weights(save_dir)
                report += ' [red](saved)[/red]'
            else:
                report += f' ({epoch - best_epoch})'
            out_of_patience = epoch - best_epoch >= patience
            if out_of_patience and not improved:
                report += ' early stop'
            logger.info(report)
            if out_of_patience:
                break
        if not best_epoch:
            # Nothing was ever saved (e.g. no epoch improved on the initial score): persist the current weights.
            self.save_weights(save_dir)
        elif best_epoch != epoch:
            # The last epoch was not the best one: restore the best checkpoint.
            self.load_weights(save_dir)
        logger.info(f"Max score of dev is {best_metric} at epoch {best_epoch}")
        logger.info(f"Average time of each epoch is {timer.elapsed_average_human}")
        logger.info(f"{timer.elapsed_human} elapsed")

    # noinspection PyMethodOverriding
    def fit_dataloader(self,
                       trn: DataLoader,
                       criterion,
                       optimizer,
                       metric,
                       logger: logging.Logger,
                       history: History,
                       gradient_accumulation=1,
                       grad_norm=None,
                       ratio_width=None,
                       eval_trn=False,
                       **kwargs):
        """Run one training epoch over ``trn``.

        Args:
            trn: Training dataloader.
            criterion: Loss passed to ``compute_loss``.
            optimizer: An ``(optimizer, scheduler)`` pair.
            metric: Metric updated on training batches when ``eval_trn`` is set.
            logger: Logger receiving progress reports.
            history: Step tracker deciding when an optimizer update is due.
            gradient_accumulation: Number of batches accumulated per update; the loss is divided by it.
            grad_norm: Gradient clipping norm passed to ``_step``.
            ratio_width: Forwarded to the progress reporting.
            eval_trn: Whether to decode training batches and update ``metric``.
            **kwargs: Ignored.
        """
        optimizer, scheduler = optimizer
        self.model.train()
        timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
        total_loss = 0
        for idx, batch in enumerate(trn):
            scores, mask = self.feed_batch(batch)
            loss = self.compute_loss(criterion, scores, batch['srl_id'], mask)
            if gradient_accumulation and gradient_accumulation > 1:
                loss /= gradient_accumulation
            loss.backward()
            total_loss += loss.item()
            if eval_trn:
                self.update_metrics(metric, self.decode_output(scores, mask, batch), batch)
            if history.step(gradient_accumulation):
                self._step(optimizer, scheduler, grad_norm)
                if eval_trn:
                    report = f'loss: {total_loss / (idx + 1):.4f} {metric}'
                else:
                    report = f'loss: {total_loss / (idx + 1):.4f}'
                timer.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
            # Drop references to the per-batch tensors before the next batch is processed.
            del loss, scores, mask

    def naive_decode(self, pred, mask, batch, decoder=None):
        """Decode by taking the arg-max tag for every (row, token) cell of the padded score tensor.

        Args:
            pred: Score tensor whose last dimension ranges over tags.
            mask: Unused.
            batch: Batch dict; ``batch['token']`` holds the tokens of each sentence.
            decoder: Unused.

        Returns:
            For each sentence, a list with one ``get_entities`` result per token row.
        """
        id_to_tag = self.vocabs['srl'].idx_to_token
        decoded = []
        for tokens, tag_matrix in zip(batch['token'], pred.argmax(-1).tolist()):
            sent_len = len(tokens)
            per_row = []
            # zip() stops at the sentence length, so padded rows are skipped.
            for _, tag_row in zip(tokens, tag_matrix):
                # Map the whole padded row, then cut the padding columns off.
                tags = [id_to_tag[tag_id] for tag_id in tag_row][:sent_len]
                per_row.append(get_entities(tags))
            decoded.append(per_row)
        return decoded

    def decode_output(self, pred, mask, batch, decoder=None):
        """Turn model scores into per-sentence, per-row lists of labelled spans.

        Args:
            pred: Scores from ``feed_batch``. When ``mask`` is ``None`` it is taken to be an already decoded
                flat sequence of tag ids.
            mask: The mask from ``compute_mask``: a ``(token_index, mask)`` pair in CRF mode, a boolean
                tensor otherwise, or ``None`` to skip decoding.
            batch: Batch dict; ``batch['token']`` holds the tokens of each sentence.
            decoder: Decoder owning the CRF layer; defaults to ``self.model.decoder``.

        Returns:
            For each sentence, a list with one ``get_entities`` result per token row.

        Raises:
            AssertionError: If the number of decoded tags does not equal the sum of ``len(sent) ** 2``.
        """
        vocab = self.vocabs['srl'].idx_to_token
        if mask is not None:
            if self.config.crf:
                if not decoder:
                    decoder = self.model.decoder
                crf: CRF = decoder.crf
                # In CRF mode the mask is a pair; only its second element is needed for decoding.
                _, mask = mask
                # Flatten the decoded paths in linear time: ``sum(paths, [])`` re-copies the growing
                # accumulator for every path, which is quadratic in the number of tags.
                pred = [tag_id for path in crf.decode(pred, mask) for tag_id in path]
            else:
                pred = pred[mask].argmax(-1).tolist()
        pred = [vocab[x] for x in pred]
        results = []
        offset = 0
        for sent in batch['token']:
            sent_len = len(sent)
            per_row = []
            # The flat tag sequence holds ``sent_len`` consecutive rows of ``sent_len`` tags per sentence.
            for _ in sent:
                per_row.append(get_entities(pred[offset:offset + sent_len]))
                offset += sent_len
            results.append(per_row)
        assert offset == len(pred)
        return results

    def update_metrics(self, metric, prediction, batch):
        """Feed predicted and gold SRL tuples of every sentence to ``metric``.

        Args:
            metric: Callable invoked as ``metric(predicted_set, gold_set)`` once per sentence.
            prediction: For each sentence, a list indexed by row, each item a list of ``(label, start, end)``.
            batch: Batch dict; ``batch['srl_set']`` holds the gold set of each sentence.

        Returns:
            The same ``metric`` object.
        """
        for per_row, gold in zip(prediction, batch['srl_set']):
            predicted = {
                (row, start, end, label)
                for row, arguments in enumerate(per_row)
                for (label, start, end) in arguments
            }
            metric(predicted, gold)
        return metric

    def feed_batch(self, batch: dict):
        """Run the model on a batch and build the matching pairwise mask.

        Args:
            batch: Batch dict; ``batch['token_length']`` supplies the sentence lengths.

        Returns:
            ``(pred, mask3d)`` where ``mask3d`` is the result of ``compute_mask``. In CRF mode ``pred`` is
            additionally flattened over its first two dimensions, restricted to the rows selected by the
            first element of ``mask3d`` and log-softmaxed over its last dimension.
        """
        mask2d = lengths_to_mask(batch['token_length'])
        pred = self.model(batch, mask=mask2d)
        mask3d = self.compute_mask(mask2d)
        if not self.config.crf:
            return pred, mask3d
        token_index = mask3d[0]
        pred = F.log_softmax(pred.flatten(end_dim=1)[token_index], dim=-1)
        return pred, mask3d

    def compute_mask(self, mask2d):
        """Expand a per-token mask into a pairwise mask over (row token, column token).

        Args:
            mask2d: Two-dimensional mask (assumed ``(batch, seq_len)`` boolean) marking real tokens.
                It is left unmodified.

        Returns:
            Without CRF, a ``(batch, seq_len, seq_len)`` mask that is true where both tokens are real.
            With CRF, a ``(token_index, mask)`` pair: ``token_index`` is the first column of the mask
            flattened over batch and rows, and ``mask`` keeps only the flattened rows it selects.
        """
        seq_len = mask2d.size(1)
        # Use the out-of-place ``unsqueeze``: the in-place variant would silently turn the caller's
        # 2-D mask into a 3-D tensor.
        mask3d = mask2d.unsqueeze(-1).expand(-1, -1, seq_len)
        mask3d = mask3d & mask3d.transpose(1, 2)
        if self.config.crf:
            mask3d = mask3d.flatten(end_dim=1)
            # A row's first column is true exactly when the row token and the sentence's first token
            # are both real; presumably every sentence has at least one token, so this marks real rows.
            token_index = mask3d[:, 0]
            mask3d = mask3d[token_index]
            return token_index, mask3d
        return mask3d

    def _step(self, optimizer, scheduler, grad_norm):
        """Perform one parameter update.

        Gradients of ``self.model`` are clipped with ``grad_norm``, then the optimizer and the scheduler are
        stepped and the gradients are cleared for the next accumulation round.

        Args:
            optimizer: Optimizer to step.
            scheduler: Learning-rate scheduler to step after the optimizer.
            grad_norm: Value handed to ``clip_grad_norm``.
        """
        clip_grad_norm(self.model, grad_norm)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # noinspection PyMethodOverriding
    def build_model(self, embed: Embedding, encoder, training, **kwargs) -> torch.nn.Module:
        """Instantiate the span-BIO SRL model from the embedding config and hyper-parameters.

        Args:
            embed: Embedding config; its ``module`` factory builds the embedding layer.
            encoder: Optional encoder placed between the embedding and the decoder.
            training: Forwarded to the embedding factory.
            **kwargs: Ignored.

        Returns:
            A ``SpanBIOSemanticRoleLabelingModel`` sized by the SRL vocabulary.
        """
        embedding_layer = embed.module(training=training, vocabs=self.vocabs)
        num_labels = len(self.vocabs.srl)
        config = self.config
        # noinspection PyCallByClass
        return SpanBIOSemanticRoleLabelingModel(
            embedding_layer,
            encoder,
            num_labels,
            config.n_mlp_rel,
            config.mlp_dropout,
            config.crf,
        )

    # noinspection PyMethodOverriding
    def build_dataloader(self, data, batch_size,
                         sampler_builder: SamplerBuilder = None,
                         gradient_accumulation=1,
                         shuffle=False, device=None, logger: logging.Logger = None,
                         transform=None,
                         **kwargs) -> DataLoader:
        """Wrap ``data`` in a padded dataloader, building the dataset and vocabularies when needed.

        Args:
            data: A ready ``TransformableDataset`` or raw data handed to ``build_dataset``.
            batch_size: Batch size passed to the dataloader.
            sampler_builder: Optional builder of a batch sampler from the sequence lengths.
            gradient_accumulation: Forwarded to the sampler builder.
            shuffle: Whether to shuffle.
            device: Device passed to the dataloader.
            logger: Logger used while building vocabularies.
            transform: Optional extra transform applied before the built-in ones.
            **kwargs: Ignored.

        Returns:
            A ``PadSequenceDataLoader`` over the dataset.
        """
        if isinstance(data, TransformableDataset):
            dataset = data
        else:
            pipeline = [self.config.embed.transform(vocabs=self.vocabs), self.vocabs, FieldLength('token')]
            if transform:
                # A user supplied transform runs ahead of the built-in pipeline.
                pipeline = [transform] + pipeline
            dataset = self.build_dataset(data, pipeline)
        if self.vocabs.mutable:
            # noinspection PyTypeChecker
            self.build_vocabs(dataset, logger)
        lens = [len(sample['token_input_ids']) for sample in dataset]
        sampler = sampler_builder.build(lens, shuffle, gradient_accumulation) if sampler_builder else None
        return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler)

    def build_dataset(self, data, transform):
        """Create the CoNLL-2012 BIO SRL dataset for ``data``.

        Args:
            data: Raw data; caching is enabled only when it is a string.
            transform: Transform(s) applied by the dataset.

        Returns:
            A ``CoNLL2012SRLBIODataset``.
        """
        use_doc_offset = self.config.get('doc_level_offset', True)
        should_cache = isinstance(data, str)
        return CoNLL2012SRLBIODataset(data,
                                      transform=transform,
                                      doc_level_offset=use_doc_offset,
                                      cache=should_cache)

    def build_vocabs(self, dataset, logger, **kwargs):
        """Create and lock the SRL vocabulary, then guess the token delimiter if none is configured.

        Args:
            dataset: Dataset iterated once in full; presumably its transforms fill the vocabularies as a
                side effect of that pass (verify against the transform pipeline).
            logger: Logger receiving the vocabulary summary and the delimiter guess.
            **kwargs: Ignored.
        """
        self.vocabs.srl = Vocab(pad_token=None, unk_token=None)
        timer = CountdownTimer(len(dataset))
        max_seq_len = 0
        for sample in dataset:
            current_len = len(sample['token_input_ids'])
            if current_len > max_seq_len:
                max_seq_len = current_len
            timer.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {max_seq_len})')
        self.vocabs['srl'].set_unk_as_safe_unk()  # C-ARGM-FRQ appears only in test set
        self.vocabs.lock()
        self.vocabs.summary(logger)
        if self.config.get('delimiter') is not None:
            return
        # No delimiter configured: infer one from the tokens of the first sample.
        tokens = dataset[0]['token']
        self.config.delimiter = guess_delimiter(tokens)
        logger.info(f'Guess the delimiter between tokens could be [blue]"{self.config.delimiter}"[/blue]. '
                    f'If not, specify `delimiter` in `fit()`')

    def predict(self, data: Union[str, List[str]], batch_size: int = None, **kwargs):
        """Predict predicate-argument structures for one tokenized sentence or a list of them.

        Args:
            data: A single sentence (list of token strings) or a list of such sentences.
            batch_size: Number of sentences per batch. When omitted, the ``batch_size`` stored in the config
                is used, falling back to 32 (the default of ``fit``).
            **kwargs: Forwarded to ``build_dataloader``.

        Returns:
            An empty list for empty input; otherwise the result of ``prediction_to_result`` for each
            sentence in input order, or just the single result when ``data`` was one sentence.
        """
        if not data:
            return []
        flat = self.input_is_flat(data)
        if flat:
            data = [data]
        if batch_size is None:
            # Passing ``None`` on to the data loader would disable automatic batching, so fall back to
            # the batch size recorded at training time.
            batch_size = self.config.get('batch_size', 32)
        dataloader = self.build_dataloader(self.build_samples(data), batch_size, device=self.device, **kwargs)
        results = []
        order = []
        for batch in dataloader:
            pred, mask = self.feed_batch(batch)
            prediction = self.decode_output(pred, mask, batch)
            results.extend(self.prediction_to_result(prediction, batch))
            order.extend(batch[IDX])
        # Batches may arrive in a different order than the input; restore the original order.
        results = reorder(results, order)
        if flat:
            return results[0]
        return results

    def build_samples(self, data):
        """Wrap every item of ``data`` in a sample dict under the ``'token'`` key."""
        samples = []
        for tokens in data:
            samples.append({'token': tokens})
        return samples

    # noinspection PyMethodOverriding
    def fit(self,
            trn_data,
            dev_data,
            save_dir,
            embed,
            encoder=None,
            lr=1e-3,
            transformer_lr=1e-4,
            adam_epsilon=1e-8,
            warmup_steps=0.1,
            weight_decay=0,
            crf=False,
            n_mlp_rel=300,
            mlp_dropout=0.2,
            batch_size=32,
            gradient_accumulation=1,
            grad_norm=1,
            loss_reduction='mean',
            epochs=30,
            delimiter=None,
            doc_level_offset=True,
            eval_trn=False,
            logger=None,
            devices: Union[float, int, List[int]] = None,
            transform=None,
            **kwargs):
        """Train the labeler; all arguments are merged with ``kwargs`` and forwarded to the parent ``fit``.

        Args:
            trn_data: Training data.
            dev_data: Development data.
            save_dir: Directory where weights are saved.
            embed: Embedding config providing the ``module`` and ``transform`` factories.
            encoder: Optional encoder placed on top of the embeddings.
            lr: Learning rate.
            transformer_lr: Learning rate of the transformer found in the embedding.
            adam_epsilon: Epsilon of the optimizer.
            warmup_steps: Warm-up setting of the scheduler (presumably a ratio when given as a float).
            weight_decay: Weight decay of the optimizer.
            crf: Whether to train and decode with the decoder's CRF layer instead of cross entropy.
            n_mlp_rel: Passed to the model as its ``n_mlp_rel`` argument.
            mlp_dropout: Passed to the model as its ``mlp_dropout`` argument.
            batch_size: Batch size.
            gradient_accumulation: Number of batches accumulated per optimizer update.
            grad_norm: Gradient clipping norm.
            loss_reduction: Reduction applied to the loss.
            epochs: Maximum number of epochs.
            delimiter: String joining tokens of a span in results; guessed from the data when ``None``.
            doc_level_offset: Forwarded to the dataset constructor.
            eval_trn: Whether to also update the metric on training batches.
            logger: Logger for training progress.
            devices: Devices to train on.
            transform: Optional extra transform applied ahead of the built-in ones.
            **kwargs: Extra settings merged into the call.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        # NOTE: ``locals()`` captures every local name, so no local variables may be introduced above.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def compute_loss(self, criterion, pred, srl, mask):
        """Compute the training loss for one batch.

        Args:
            criterion: The CRF layer in CRF mode, otherwise a callable ``criterion(scores, gold)``.
            pred: Scores returned by ``feed_batch``.
            srl: Gold tag ids.
            mask: A ``(token_index, mask)`` pair in CRF mode, otherwise a boolean mask indexing both
                ``pred`` and ``srl``.

        Returns:
            The negated CRF ``forward`` result in CRF mode, otherwise ``criterion`` over the masked entries.
        """
        if not self.config.crf:
            return criterion(pred[mask], srl[mask])
        token_index, crf_mask = mask
        # Gold tags are flattened and filtered exactly like ``pred`` was in ``feed_batch``.
        gold = srl.flatten(end_dim=1)[token_index]
        score = criterion.forward(pred, gold, crf_mask, reduction=self.config.loss_reduction)
        return -score

    # noinspection PyMethodOverriding
    @torch.no_grad()
    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric, logger, ratio_width=None,
                            filename=None, **kwargs):
        """Evaluate the model on ``data`` without tracking gradients.

        Args:
            data: Dataloader to evaluate on.
            criterion: Loss passed to ``compute_loss``.
            metric: Metric object; it is reset first and then updated with every batch.
            logger: Logger receiving progress reports.
            ratio_width: Forwarded to the progress reporting.
            filename: Unused.
            **kwargs: Ignored.

        Returns:
            ``(average_loss, metric)`` where the average divides the summed batch losses by the timer's total.
        """
        self.model.eval()
        timer = CountdownTimer(len(data))
        metric.reset()
        total_loss = 0
        for idx, batch in enumerate(data):
            scores, mask = self.feed_batch(batch)
            total_loss += self.compute_loss(criterion, scores, batch['srl_id'], mask).item()
            self.update_metrics(metric, self.decode_output(scores, mask, batch), batch)
            timer.log(f'loss: {total_loss / (idx + 1):.4f} {metric}', logger=logger, ratio_percentage=False,
                      ratio_width=ratio_width)
        return total_loss / timer.total, metric

    def input_is_flat(self, data) -> bool:
        """Tell whether ``data`` is a single sentence, i.e. whether its first element is a string."""
        first_item = data[0]
        return isinstance(first_item, str)

    def prediction_to_result(self, prediction: List, batch: Dict[str, Any], delimiter=None) -> List:
        """Lazily convert decoded spans into predicate-argument structures with surface text.

        Args:
            prediction: For each sentence, a list indexed by predicate position of ``(label, start, end)``.
            batch: Batch dict; ``batch['token']`` holds the tokens of each sentence.
            delimiter: String joining the tokens of a span; defaults to ``self.config.delimiter``.

        Yields:
            One list per sentence. Each item describes one predicate that has arguments: its arguments as
            ``(text, label, start, end)`` plus a ``(token, PRED, i, i + 1)`` entry for the predicate itself,
            inserted at the position ``bisect`` finds among the argument start offsets.
        """
        if delimiter is None:
            delimiter = self.config.delimiter
        for per_predicate, tokens in zip(prediction, batch['token']):
            frames = []
            for i, arguments in enumerate(per_predicate):
                if not arguments:
                    # Rows without any argument are not reported as predicates.
                    continue
                frame = [(delimiter.join(tokens[arg[1]:arg[2]]),) + arg for arg in arguments]
                starts = [arg[1] for arg in arguments]
                frame.insert(bisect(starts, i), (tokens[i], PRED, i, i + 1))
                frames.append(frame)
            yield frames


================================================
FILE: hanlp/components/srl/span_rank/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-19 22:22

================================================
FILE: hanlp/components/srl/span_rank/highway_variational_lstm.py
================================================
# Adopted from https://github.com/KiroSummer/A_Syntax-aware_MTL_Framework_for_Chinese_SRL
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
from torch.autograd import Variable

from .layer import DropoutLayer, HighwayLSTMCell, VariationalLSTMCell


def initializer_1d(input_tensor, initializer):
    """Apply an initializer to a 1-D tensor by presenting it as a single column.

    Args:
        input_tensor: One-dimensional tensor.
        initializer: Callable taking the ``(n, 1)`` view and returning a tensor.

    Returns:
        The initializer's result flattened back to one dimension.
    """
    assert input_tensor.dim() == 1
    as_column = input_tensor.view(-1, 1)
    return initializer(as_column).view(-1)


class HighwayBiLSTM(nn.Module):
    """A module that runs multiple steps of HighwayBiLSTM."""

    def __init__(self, input_size, hidden_size, num_layers=1, batch_first=False, bidirectional=False, dropout_in=0,
                 dropout_out=0):
        """Build the per-layer highway LSTM cells and their dropout layers.

        Args:
            input_size: Feature size of the input to the first forward cell.
            hidden_size: Hidden size of every cell.
            num_layers: Number of layers.
            batch_first: Whether inputs are batch-major.
            bidirectional: Whether each layer also has a backward cell, which consumes the forward output.
            dropout_in: Stored but not used here.
            dropout_out: Dropout rate given to every ``DropoutLayer``.
        """
        super(HighwayBiLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.bidirectional = bidirectional
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.num_directions = 2 if bidirectional else 1

        fcells, f_dropout, f_hidden_dropout = [], [], []
        bcells, b_dropout, b_hidden_dropout = [], [], []
        for layer in range(num_layers):
            forward_input_size = input_size if layer == 0 else hidden_size
            fcells.append(HighwayLSTMCell(input_size=forward_input_size, hidden_size=hidden_size))
            f_dropout.append(DropoutLayer(hidden_size, dropout_out))
            f_hidden_dropout.append(DropoutLayer(hidden_size, dropout_out))
            if not bidirectional:
                continue
            # The backward cell reads the forward cell's output, hence ``hidden_size`` inputs.
            bcells.append(HighwayLSTMCell(input_size=hidden_size, hidden_size=hidden_size))
            b_dropout.append(DropoutLayer(hidden_size, dropout_out))
            b_hidden_dropout.append(DropoutLayer(hidden_size, dropout_out))
        # The hidden-dropout layers stay plain Python lists, so unlike the other containers they are not
        # registered as sub-modules.
        self.f_hidden_dropout, self.b_hidden_dropout = f_hidden_dropout, b_hidden_dropout
        self.fcells, self.bcells = nn.ModuleList(fcells), nn.ModuleList(bcells)
        self.f_dropout, self.b_dropout = nn.ModuleList(f_dropout), nn.ModuleList(b_dropout)

    def reset_dropout_layer(self, batch_size):
        for layer in range(self.num_layers):
            self.f_dropout[layer].reset_dropout_mask(batch_size)
            if self.bidirectional:
                self.b_dropout[layer].reset_dropout_mask(batch_size)

    @staticmethod
    def _forward_rnn(cell, gate, input, masks, initial, drop_masks=None, hidden_drop=None):
        max_time = input.size(0)
        output = []
        hx = initial
        for time in range(max_time):
            h_next, c_next = cell(input[time], mask=masks[time], hx=hx, dropout=drop_masks)
            hx = (h_next, c_next)
            output.append(h_next)
        output = torch.stack(output, 0)
        return output, hx

    @staticmethod
    def _forward_brnn(cell, gate, input, masks, initial, drop_masks=None, hidden_drop=None):
        max_time = input.size(0)
        output = []
        hx = initial
        for time in reversed(list(range(max_time))):
            h_next, c_next = cell(input[time], mask=masks[time], hx=hx, dropout=drop_masks)
            hx = (h_next, c_next)
            output.append(h_next)
        output.reverse()
        output = torch.stack(output, 0)
        return output, hx

    def forward(self, input, masks, initial=None):
        if self.batch_first:
            input = input.transpose(0, 1)  # transpose: return the transpose matrix
            masks = torch.unsqueeze(masks.transpose(0, 1), dim=2)
        max_time, batch_size, _ = input.size()

        self.reset_dropout_layer(batch_size)  # reset the dropout each batch forward

        masks = masks.expand(-1, -1, self.hidden_size)  # expand: -1 means not expand that dimension
        if initial is None:
            initial = Variable(input.data.new(batch_size, self.hidden_size).zero_())
            initial = (initial, initial)  # h0, c0

        h_n, c_n = [], []
        for layer in range(self.num_layers):
            # hidden_mask, hidden_drop = None, None
            hidden_mask, hidden_drop = self.f_dropout[layer], self.f_hidden_dropout[layer]
            layer_output, (layer_h_n, layer_c_n) = HighwayBiLSTM._forward_rnn(cell=self.fcells[layer], \
                                                                              gate=None, input=input, masks=masks,
                                                                              initial=initial, \
                                                                              drop_masks=hidden_mask,
                                                                              hidden_drop=hidden_drop)
            h_n.append(layer_h_n)
            c_n.append(layer_c_n)
            if self.bidirectional:
                hidden_mask, hidden_drop = self.b_dropout[layer], self.b_hidden_dropout[layer]
                blayer_output, (blayer_h_n, blayer_c_n) = HighwayBiLSTM._forward_brnn(cell=self.bcells[layer], \
                                                                                      gate=None, input=layer_output,
                                                                                      masks=masks, initial=initial, \
                                                                                      drop_masks=hidden_mask,
                                                                                      hidden_drop=hidden_drop)
                h_n.append(blayer_h_n)
                c_n.append(blayer_c_n)

            input = blayer_output if self.bidirectional else layer_output

        h_n, c_n = torch.stack(h_n, 0), torch.stack(c_n, 0)
        if self.batch_first:
            input = input.transpose(1, 0)  # transpose: return the transpose matrix
        return input, (h_n, c_n)


class StackedHighwayBiLSTM(nn.Module):
    """A stack of bidirectional variational LSTM layers; every layer after the first is mixed with its input through a highway gate."""

    def __init__(self, input_size, hidden_size, num_layers=1, batch_first=False,
                 bidirectional=False, dropout_in=0, dropout_out=0):
        """Build the cells, dropout layers, learnable initial states and highway projections.

        Args:
            input_size: Feature size of the input to the first layer.
            hidden_size: Hidden size of every cell.
            num_layers: Number of layers.
            batch_first: Whether inputs are batch-major.
            bidirectional: Must be ``True`` whenever at least one layer is built.
            dropout_in: Stored but not used here.
            dropout_out: Dropout rate given to every ``DropoutLayer``.

        Raises:
            AssertionError: If ``bidirectional`` is not ``True`` and ``num_layers`` is positive.
        """
        super(StackedHighwayBiLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.bidirectional = bidirectional
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.num_directions = 2 if bidirectional else 1

        fcells, f_dropout, f_hidden_dropout, f_initial = [], [], [], []
        bcells, b_dropout, b_hidden_dropout, b_initial = [], [], [], []
        # Layers above the first consume the concatenated forward/backward output of the layer below.
        stacked_input_size = 2 * hidden_size if bidirectional else hidden_size
        for layer in range(num_layers):
            layer_input_size = input_size if layer == 0 else stacked_input_size
            fcells.append(VariationalLSTMCell(input_size=layer_input_size, hidden_size=hidden_size))
            f_dropout.append(DropoutLayer(hidden_size, dropout_out))
            f_hidden_dropout.append(DropoutLayer(hidden_size, dropout_out))
            # Row 0 / row 1 are later used as the initial h / c (see ``reset_state``).
            f_initial.append(nn.Parameter(torch.Tensor(2, hidden_size)))
            assert self.bidirectional is True
            bcells.append(VariationalLSTMCell(input_size=layer_input_size, hidden_size=hidden_size))
            b_dropout.append(DropoutLayer(hidden_size, dropout_out))
            b_hidden_dropout.append(DropoutLayer(hidden_size, dropout_out))
            b_initial.append(nn.Parameter(torch.Tensor(2, hidden_size)))
        # One highway projection per layer above the first.
        self.lstm_project_layer = nn.ModuleList(
            [nn.Linear(2 * hidden_size, 2 * hidden_size) for _ in range(num_layers - 1)])
        self.fcells, self.bcells = nn.ModuleList(fcells), nn.ModuleList(bcells)
        self.f_dropout, self.b_dropout = nn.ModuleList(f_dropout), nn.ModuleList(b_dropout)
        self.f_hidden_dropout = nn.ModuleList(f_hidden_dropout)
        self.b_hidden_dropout = nn.ModuleList(b_hidden_dropout)
        self.f_initial, self.b_initial = nn.ParameterList(f_initial), nn.ParameterList(b_initial)
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialize the learnable initial states and the highway projection layers."""
        for initial in list(self.f_initial) + list(self.b_initial):
            init.xavier_uniform_(initial)
        for projection in self.lstm_project_layer:
            init.xavier_uniform_(projection.weight)
            # The bias is 1-D, so it is initialized through a column view.
            initializer_1d(projection.bias, init.xavier_uniform_)

    def reset_dropout_layer(self, batch_size):
        for layer in range(self.num_layers):
            self.f_dropout[layer].reset_dropout_mask(batch_size)
            self.f_hidden_dropout[layer].reset_dropout_mask(batch_size)
            if self.bidirectional:
                self.b_dropout[layer].reset_dropout_mask(batch_size)
                self.b_hidden_dropout[layer].reset_dropout_mask(batch_size)

    def reset_state(self, batch_size):
        f_states, b_states = [], []
        for f_layer_initial, b_layer_initial in zip(self.f_initial, self.b_initial):
            f_states.append([f_layer_initial[0].expand(batch_size, -1), f_layer_initial[1].expand(batch_size, -1)])
            b_states.append([b_layer_initial[0].expand(batch_size, -1), b_layer_initial[1].expand(batch_size, -1)])
        return f_states, b_states

    @staticmethod
    def _forward_rnn(cell, gate, input, masks, initial, drop_masks=None, hidden_drop=None):
        max_time = input.size(0)
        output = []
        hx = initial
        for time in range(max_time):
            h_next, c_next = cell(input[time], mask=masks[time], hx=hx, dropout=drop_masks)
            hx = (h_next, c_next)
            output.append(h_next)
        output = torch.stack(output, 0)
        return output, hx

    @staticmethod
    def _forward_brnn(cell, gate, input, masks, initial, drop_masks=None, hidden_drop=None):
        max_time = input.size(0)
        output = []
        hx = initial
        for time in reversed(list(range(max_time))):
            h_next, c_next = cell(input[time], mask=masks[time], hx=hx, dropout=drop_masks)
            hx = (h_next, c_next)
            output.append(h_next)
        output.reverse()
        output = torch.stack(output, 0)
        return output, hx

    def forward(self, input, masks, initial=None):
        if self.batch_first:
            input = input.transpose(0, 1)  # transpose: return the transpose matrix
            masks = torch.unsqueeze(masks.transpose(0, 1), dim=2)
        max_time, batch_size, _ = input.size()

        self.reset_dropout_layer(batch_size)  # reset the dropout each batch forward
        f_states, b_states = self.reset_state(batch_size)

        masks = masks.expand(-1, -1, self.hidden_size)  # expand: -1 means not expand that dimension

        h_n, c_n = [], []
        outputs = []
        for layer in range(self.num_layers):
            hidden_mask, hidden_drop = self.f_dropout[layer], self.f_hidden_dropout[layer]
            layer_output, (layer_h_n, layer_c_n) = \
                StackedHighwayBiLSTM._forward_rnn(cell=self.fcells[layer],
                                                  gate=None, input=input, masks=masks, initial=f_states[layer],
                                                  drop_masks=hidden_mask, hidden_drop=hidden_drop)
            h_n.append(layer_h_n)
            c_n.append(layer_c_n)
            assert self.bidirectional is True
            hidden_mask, hidden_drop = self.b_dropout[layer], self.b_hidden_dropout[layer]
            blayer_output, (blayer_h_n, blayer_c_n) = \
                StackedHighwayBiLSTM._forward_brnn(cell=self.bcells[layer],
                                                   gate=None, input=input, masks=masks, initial=b_states[layer],
                                                   drop_masks=hidden_mask, hidden_drop=hidden_drop)
            h_n.append(blayer_h_n)
            c_n.append(blayer_c_n)

            output = torch.cat([layer_output, blayer_output], 2) if self.bidirectional else layer_output
            output = F.dropout(output, self.dropout_out, self.training)
            if layer > 0:  # Highway
                highway_gates = torch.sigmoid(self.lstm_project_layer[layer - 1].forward(output))
                output = highway_gates * output + (1 - highway_gates) * input
            if self.batch_first:
                outputs.append(output.transpose(1, 0))
            else:
                outputs.append(output)
            input = output

        h_n, c_n = torch.stack(h_n, 0), torch.stack(c_n, 0)
        if self.batch_first:
            output = output.transpose(1, 0)  # transpose: return the transpose matrix
        return output, (h_n, c_n), outputs


================================================
FILE: hanlp/components/srl/span_rank/inference_utils.py
================================================
# Adopted from https://github.com/KiroSummer/A_Syntax-aware_MTL_Framework_for_Chinese_SRL

# Inference functions for the SRL model.
import numpy as np


def decode_spans(span_starts, span_ends, span_scores, labels_inv):
    """Label candidate spans with their best-scoring label, keeping each (start, end) at most once.

    Args:
      span_starts: [num_candidates,] start offsets.
      span_ends: [num_candidates,] end offsets.
      span_scores: [num_candidates, num_labels] label scores.
      labels_inv: Mapping from label id to label name; id 0 is treated as "no label".

    Returns:
      A list of ``(start, end, label_name)`` ordered by decreasing score of the chosen label.
    """
    best_labels = np.argmax(span_scores, axis=1)  # [num_candidates]
    candidates = sorted(zip(span_starts, span_ends, best_labels, span_scores),
                        key=lambda cand: cand[3][cand[2]], reverse=True)
    taken = set()
    decoded = []
    for start, end, label, _ in candidates:
        # Skip null-labelled spans and spans already emitted with a higher score.
        if label == 0 or (start, end) in taken:
            continue
        decoded.append((start, end, labels_inv[label]))
        taken.add((start, end))
    return decoded


def greedy_decode(predict_dict, srl_labels_inv):
    """Greedy decoding for SRL predicate-argument structures.

    For every predicate, arguments with a non-null label are visited from the highest score to the lowest
    and kept only if they overlap neither the predicate token nor an argument kept earlier.

    Args:
      predict_dict: Dictionary of name to numpy arrays with the keys ``arg_starts``, ``arg_ends``,
        ``predicates``, ``arg_labels`` (indexed ``[argument][predicate]``) and ``srl_scores``
        (indexed ``[argument][predicate][label]``).
      srl_labels_inv: SRL label id to string name; id 0 is the null label.

    Returns:
      A ``(pred_to_args, num_suppressed_args)`` tuple: a dict from predicate position to its kept
      ``(start, end, label)`` arguments (predicates without arguments are omitted) and the number of
      labelled arguments dropped because of overlaps.
    """
    arg_starts = predict_dict["arg_starts"]
    arg_ends = predict_dict["arg_ends"]
    predicates = predict_dict["predicates"]
    arg_labels = predict_dict["arg_labels"]
    scores = predict_dict["srl_scores"]

    # The coverage flags must span every predicate and every argument end. Each sequence is sized on its
    # own so that predicates without any candidate argument do not index past the end of the flags.
    max_len = 1
    if len(arg_ends) > 0:
        max_len = max(max_len, int(np.max(arg_ends)) + 1)
    if len(predicates) > 0:
        max_len = max(max_len, int(np.max(predicates)) + 1)

    num_suppressed_args = 0
    # Map from predicates to a list of labeled spans.
    pred_to_args = {}
    for j, pred_id in enumerate(predicates):
        args_list = []
        for i, (arg_start, arg_end) in enumerate(zip(arg_starts, arg_ends)):
            label_id = arg_labels[i][j]
            # Keep only arguments whose label is not null.
            if label_id == 0:
                continue
            args_list.append((arg_start, arg_end, srl_labels_inv[label_id], scores[i][j][label_id]))

        # Sort arguments by highest score first.
        args_list.sort(key=lambda x: x[3], reverse=True)

        flags = [False] * max_len
        # Predicate will not overlap with arguments either.
        flags[pred_id] = True

        new_args_list = []
        for arg_start, arg_end, label, _ in args_list:
            # Keep the span only if none of its tokens has been covered yet.
            if any(flags[arg_start:arg_end + 1]):
                continue
            new_args_list.append((arg_start, arg_end, label))
            for k in range(arg_start, arg_end + 1):
                flags[k] = True

        # Only add predicate if it has any argument.
        if new_args_list:
            pred_to_args[pred_id] = new_args_list

        num_suppressed_args += len(args_list) - len(new_args_list)

    return pred_to_args, num_suppressed_args


# Core-argument label names mapped to distinct powers of two. The long ("ARG0") and short ("A0")
# spellings of the same role share one value.
_CORE_ARGS = {"ARG0": 1, "ARG1": 2, "ARG2": 4, "ARG3": 8, "ARG4": 16, "ARG5": 32, "ARGA": 64,
              "A0": 1, "A1": 2, "A2": 4, "A3": 8, "A4": 16, "A5": 32, "AA": 64}


def get_predicted_clusters(top_span_starts, top_span_ends, predicted_antecedents):
    """Group spans into clusters by following each span's predicted antecedent.

    Args:
        top_span_starts: Start offset of every candidate span, indexable by span position.
        top_span_ends: End offset of every candidate span, indexable by span position.
        predicted_antecedents: For every span, the position of its antecedent span, or a negative
            value when the span has no antecedent. An antecedent must precede the span itself.

    Returns:
        A pair ``(clusters, span_to_cluster)``: ``clusters`` is a list of tuples of ``(start, end)``
        spans, and ``span_to_cluster`` maps every clustered ``(start, end)`` span to its tuple.
    """

    def span_at(position):
        return int(top_span_starts[position]), int(top_span_ends[position])

    cluster_id_of = {}
    clusters = []
    for mention_pos, antecedent_pos in enumerate(predicted_antecedents):
        if antecedent_pos < 0:
            continue
        assert mention_pos > antecedent_pos
        antecedent = span_at(antecedent_pos)
        cluster_id = cluster_id_of.get(antecedent)
        if cluster_id is None:
            # First time this antecedent is seen: it opens a new cluster.
            cluster_id = len(clusters)
            clusters.append([antecedent])
            cluster_id_of[antecedent] = cluster_id

        mention = span_at(mention_pos)
        clusters[cluster_id].append(mention)
        cluster_id_of[mention] = cluster_id

    frozen_clusters = [tuple(members) for members in clusters]
    span_to_cluster = {span: frozen_clusters[cluster_id] for span, cluster_id in cluster_id_of.items()}
    return frozen_clusters, span_to_cluster


def _decode_non_overlapping_spans(starts, ends, scores, max_len, labels_inv, pred_id):
    """Greedily keep the highest-scoring labeled spans that do not overlap each other.

    Args:
        starts: Start index of every candidate span.
        ends: Inclusive end index of every candidate span.
        scores: 2-D array of per-span label scores; a span whose best label index is ``<= 0``
            is dropped.
        max_len: Number of token positions tracked for overlap.
        labels_inv: Lookup from label index to label string.
        pred_id: Predicate position, or ``None``. When given, that position is blocked and spans
            labeled ``"V"`` are dropped.

    Returns:
        List of ``(start, end, label)`` tuples in descending score order.
    """
    best_labels = np.argmax(scores, axis=1)
    has_predicate = pred_id is not None

    candidates = []
    for idx, (start, end, label) in enumerate(zip(starts, ends, best_labels)):
        if label <= 0:
            continue
        label_name = labels_inv[label]
        if has_predicate and label_name == "V":
            continue
        candidates.append((start, end, label_name, scores[idx][label]))
    candidates.sort(key=lambda candidate: candidate[3], reverse=True)

    covered = np.zeros([max_len], dtype=bool)
    if has_predicate:
        covered[pred_id] = True

    kept = []
    for start, end, label_name, _ in candidates:
        if max(covered[start:end + 1]):
            # At least one token is already taken by a better span or by the predicate.
            continue
        kept.append((start, end, label_name))
        for position in range(start, end + 1):
            covered[position] = True
    return kept


def _dp_decode_non_overlapping_spans(starts, ends, scores, max_len, labels_inv, pred_id, u_constraint=False):
    """Pick a highest-scoring set of non-overlapping labeled spans for one predicate via dynamic programming.

    Args:
        starts: Start index of every candidate argument span.
        ends: Inclusive end index of every candidate argument span.
        scores: 2-D array ``[num_args, num_roles]`` of role scores. Column 0 is the dummy (null)
            role and is asserted to be exactly 0 for every span that is examined.
        max_len: Sizes the DP table; positions ``0 .. max_len`` are addressable.
        labels_inv: Lookup from role index to role label string.
        pred_id: Predicate position, or ``None``. Spans that contain this position are skipped.
        u_constraint: When ``True``, every role with a positive score is considered for a span and
            a core role listed in ``_CORE_ARGS`` may be used at most once per decoded sequence.
            When ``False``, only each span's locally best role is considered.

    Returns:
        List of ``(start, end, label)`` tuples (plain ``int`` offsets) ordered from left to right.
    """
    num_roles = scores.shape[1]  # [num_arg, num_roles]
    labels = np.argmax(scores, axis=1).astype(np.int64)
    spans = list(zip(starts, ends, list(range(len(starts)))))
    spans = sorted(spans, key=lambda x: (x[0], x[1]))  # order candidates by (start, end)

    # f[t][rs] is the best total score of a span sequence ending right before position t whose
    # used core roles are encoded by bitmask rs. Cells start at -0.1 so that any path reached
    # from the origin (score 0 plus positive deltas) beats an unvisited cell.
    if u_constraint:
        f = np.zeros([max_len + 1, 128], dtype=float) - 0.1  # 128 = all combinations of the 7 core-role bits
    else:  # no uniqueness constraint: a single role state (index 0) is enough
        f = np.zeros([max_len + 1, 1], dtype=float) - 0.1

    f[0, 0] = 0
    states = {0: set([0])}  # position -> set of core-role bitmasks reached at that position
    pointers = {}  # (position, bitmask) -> (arg_id, role, prev_position, prev_bitmask) for backtracking
    best_state = [(0, 0)]  # single-element list so the nested function can rebind the best state

    def _update_state(t0, rs0, t1, rs1, delta, arg_id, role):
        # Relax the transition (t0, rs0) -> (t1, rs1) obtained by taking span `arg_id` with `role`.
        if f[t0][rs0] + delta > f[t1][rs1]:
            f[t1][rs1] = f[t0][rs0] + delta
            if t1 not in states:
                states[t1] = set()
            states[t1].update([rs1])
            pointers[(t1, rs1)] = (arg_id, role, t0, rs0)  # remember how this state was reached
            if f[t1][rs1] > f[best_state[0][0]][best_state[0][1]]:
                best_state[0] = (t1, rs1)

    for start, end, i in spans:  # [arg_start, arg_end, arg_span_id]
        assert scores[i][0] == 0  # the dummy role must score exactly zero
        # Skipping a span is equivalent to giving it the zero-score dummy role, so spans that
        # overlap the predicate can simply be ignored.
        if pred_id is not None and start <= pred_id and pred_id <= end:  # span contains the predicate
            continue
        r0 = labels[i]  # Locally best role assignment.
        # If the dummy role is the local best, leaving the span out is never worse.
        if r0 == 0:  # labels_inv[r0] == "O"
            continue
        r0_str = labels_inv[r0]
        # Only states that end at or before this span's start can be extended by it.
        t_states = [t for t in list(states.keys()) if t <= start]
        for t in t_states:  # for each such position
            role_states = states[t]
            # Unconstrained decoding, or a non-core best role: take the best role, bitmask unchanged.
            if not u_constraint or r0_str not in _CORE_ARGS:
                for rs in role_states:  # every bitmask reached at position t
                    _update_state(t, rs, end + 1, rs, scores[i][r0], i, r0)  # extend past the span's end
            else:
                # Constrained decoding with a core best role: try every positively scored role
                # whose core bit (if any) has not been used yet in this state.
                for rs in role_states:
                    for r in range(1, num_roles):
                        if scores[i][r] > 0:
                            r_str = labels_inv[r]
                            core_state = _CORE_ARGS.get(r_str, 0)
                            if core_state & rs == 0:
                                _update_state(t, rs, end + 1, rs | core_state, scores[i][r], i, r)
    # Backtrack from the best state to recover the chosen spans (collected right to left).
    new_spans = []
    t, rs = best_state[0]
    while (t, rs) in pointers:
        i, r, t0, rs0 = pointers[(t, rs)]
        new_spans.append((int(starts[i]), int(ends[i]), labels_inv[r]))
        t = t0
        rs = rs0
    return new_spans[::-1]


def srl_decode(sentence_lengths, predict_dict, srl_labels_inv, config):
    """Decode batched span-ranking SRL predictions into per-sentence predicate/argument structures.

    Args:
        sentence_lengths: Length of every sentence in the batch.
        predict_dict: Model outputs. The tensors under ``"num_args"``, ``"num_preds"``,
            ``"predicates"``, ``"arg_starts"``, ``"arg_ends"`` and ``"srl_scores"`` are read;
            ``"srl_scores"`` is indexed as ``[sentence, argument, predicate, role]``.
        srl_labels_inv: Lookup from role index to role label string.
        config: Object exposing ``enforce_srl_constraint`` and ``use_gold_predicates``.

    Returns:
        One dict per sentence mapping a predicate position (``int``) to its list of
        ``(start, end, label)`` argument spans sorted by ``(start, end)``. Predicates without any
        span are omitted.
    """
    num_sentences = len(sentence_lengths)
    predictions = [{} for _ in range(num_sentences)]
    if not num_sentences:
        # Nothing to decode; like the per-sentence loop, do not touch predict_dict at all.
        return predictions

    # Move everything to host memory once instead of once per sentence.
    all_num_args = predict_dict["num_args"].cpu().numpy()
    all_num_preds = predict_dict["num_preds"].cpu().numpy()
    all_predicates = predict_dict["predicates"].cpu().numpy()
    all_arg_starts = predict_dict["arg_starts"].cpu().numpy()
    all_arg_ends = predict_dict["arg_ends"].cpu().numpy()
    all_srl_scores = predict_dict["srl_scores"].detach().cpu().numpy()

    for i, sentence_length in enumerate(sentence_lengths):
        num_args = all_num_args[i]  # number of candidate argument spans of this sentence
        num_preds = all_num_preds[i]  # number of candidate predicates of this sentence
        arg_starts = all_arg_starts[i][:num_args]
        arg_ends = all_arg_ends[i][:num_args]
        for j, pred_id in enumerate(all_predicates[i][:num_preds]):
            arg_spans = _dp_decode_non_overlapping_spans(
                arg_starts,
                arg_ends,
                all_srl_scores[i, :num_args, j, :],
                sentence_length, srl_labels_inv, pred_id, config.enforce_srl_constraint)
            # To avoid warnings in the eval script.
            if config.use_gold_predicates:
                # Plain ints, consistent with the spans returned by the decoder.
                arg_spans.append((int(pred_id), int(pred_id), "V"))
            if arg_spans:
                predictions[i][int(pred_id)] = sorted(arg_spans, key=lambda x: (x[0], x[1]))

    return predictions


================================================
FILE: hanlp/components/srl/span_rank/layer.py
================================================
# Adopted from https://github.com/KiroSummer/A_Syntax-aware_MTL_Framework_for_Chinese_SRL

import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import torch.nn.functional as F

from hanlp.components.srl.span_rank.util import block_orth_normal_initializer


def get_tensor_np(t):
    """Return the values of tensor ``t`` as a NumPy array located in host memory."""
    on_host = t.data.cpu()
    return on_host.numpy()


def orthonormal_initializer(output_size, input_size):
    """Create an approximately orthonormal weight matrix by iterative refinement.

    Adapted from Timothy Dozat https://github.com/tdozat/Parser/blob/master/lib/linalg.py

    A random ``[input_size, output_size]`` matrix ``Q`` is repeatedly updated to reduce
    ``sum((Q^T Q - I) ** 2) / 2``. If an attempt diverges, the step size is halved and a fresh
    random matrix is tried, up to 10 attempts; if all attempts diverge, a plain scaled random
    matrix is returned instead.

    Args:
      output_size: Number of rows of the returned matrix.
      input_size: Number of columns of the returned matrix.

    Returns:
      A ``float32`` array of shape ``[output_size, input_size]``.
    """
    print((output_size, input_size))
    identity = np.eye(output_size)
    lr = .1
    eps = .05 / (output_size + input_size)
    success = False
    tries = 0
    loss = None
    while not success and tries < 10:
        Q = np.random.randn(input_size, output_size) / np.sqrt(output_size)
        for _ in range(100):
            QTQmI = Q.T.dot(Q) - identity
            loss = np.sum(QTQmI ** 2 / 2)
            Q2 = Q ** 2
            Q -= lr * Q.dot(QTQmI) / (
                    np.abs(Q2 + Q2.sum(axis=0, keepdims=True) + Q2.sum(axis=1, keepdims=True) - 1) + eps)
            if np.max(Q) > 1e6 or loss > 1e6 or not np.isfinite(loss):
                # Diverged: retry from scratch with a smaller step size.
                tries += 1
                lr /= 2
                break
        else:
            # Only an attempt that completed every iteration without diverging counts as a
            # success; marking success after a `break` would accept the diverged matrix.
            success = True
    if success:
        print(('Orthogonal pretrainer loss: %.2e' % loss))
    else:
        print('Orthogonal pretrainer failed, using non-orthogonal random matrix')
        Q = np.random.randn(input_size, output_size) / np.sqrt(output_size)
    return np.transpose(Q.astype(np.float32))


class LayerNorm(nn.Module):
    """Normalize the last dimension to zero mean and unit standard deviation, then scale and shift.

    ``gamma`` (initialized to ones) and ``beta`` (initialized to zeros) are learnable vectors of
    size ``features``; ``eps`` is added to the standard deviation before dividing.
    """

    def __init__(self, features, eps=1e-8):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(features))
        self.beta = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.gamma * centered / spread + self.beta


class DropoutLayer3D(nn.Module):
    """Dropout with an explicitly resampled mask for ``[batch, length, input_size]`` inputs.

    The mask is sampled in the constructor and again by :meth:`reset_dropout_mask`; ``forward``
    reuses the stored mask until it is reset. In training mode the input is multiplied by the
    0/1 mask without rescaling; in evaluation mode the input is scaled by the keep probability
    ``1 - dropout_rate`` instead.

    Args:
        input_size: Size of the last (feature) dimension.
        dropout_rate: Probability of zeroing an element.
    """

    def __init__(self, input_size, dropout_rate=0.0):
        super().__init__()
        self.dropout_rate = dropout_rate
        self.input_size = input_size
        self.drop_mask = self._sample_mask(self.input_size)

    def _sample_mask(self, *shape):
        """Draw a float 0/1 mask of ``shape`` whose entries are 1 with probability ``1 - dropout_rate``."""
        keep_prob = torch.full(shape, 1 - self.dropout_rate, dtype=torch.float)
        return torch.bernoulli(keep_prob)

    def reset_dropout_mask(self, batch_size, length):
        """Sample a fresh ``[batch_size, length, input_size]`` mask for the next forward passes."""
        self.drop_mask = self._sample_mask(batch_size, length, self.input_size)

    def forward(self, x):
        if self.training:
            # Follow the input's device rather than assuming CUDA whenever it is available, so
            # the layer also works for CPU inputs on a GPU machine. The moved mask is cached.
            if self.drop_mask.device != x.device:
                self.drop_mask = self.drop_mask.to(x.device)
            return torch.mul(x, self.drop_mask)
        else:  # eval
            return x * (1.0 - self.dropout_rate)


class DropoutLayer(nn.Module):
    """Dropout with an explicitly resampled mask for ``[batch, input_size]`` inputs.

    A mask is drawn at construction time and whenever :meth:`reset_dropout_mask` is called, and
    is reused by ``forward`` in between. Training multiplies the input by the 0/1 mask without
    rescaling; evaluation scales the input by ``1 - dropout_rate``.
    """

    def __init__(self, input_size, dropout_rate=0.0):
        super().__init__()
        self.dropout_rate = dropout_rate
        self.input_size = input_size
        keep_prob = torch.Tensor(self.input_size).fill_(1 - self.dropout_rate)
        self.drop_mask = torch.bernoulli(keep_prob)

    def reset_dropout_mask(self, batch_size):
        """Draw a new ``[batch_size, input_size]`` mask."""
        keep_prob = torch.Tensor(batch_size, self.input_size).fill_(1 - self.dropout_rate)
        self.drop_mask = torch.bernoulli(keep_prob)

    def forward(self, x):
        if not self.training:
            # Evaluation: no sampling, just scale by the keep probability.
            return x * (1.0 - self.dropout_rate)
        mask = self.drop_mask.to(x.device)
        return torch.mul(x, mask)


class NonLinear(nn.Module):
    """A linear layer followed by an optional activation.

    Args:
        input_size: Number of input features.
        hidden_size: Number of output features.
        activation: Callable applied to the linear output; ``None`` means identity.

    Raises:
        ValueError: If ``activation`` is given but is not callable.
    """

    def __init__(self, input_size, hidden_size, activation=None):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.linear = nn.Linear(in_features=input_size, out_features=hidden_size)
        if activation is not None and not callable(activation):
            raise ValueError("activation must be callable: type={}".format(type(activation)))
        self._activate = (lambda x: x) if activation is None else activation

        self.reset_parameters()

    def forward(self, x):
        return self._activate(self.linear(x))

    def reset_parameters(self):
        """Xavier-uniform weights and a zero bias."""
        linear = self.linear
        nn.init.xavier_uniform_(linear.weight)
        nn.init.zeros_(linear.bias)


class Biaffine(nn.Module):
    """Biaffine scorer between every position of two sequences.

    Args:
        in1_features: Feature size of the first input.
        in2_features: Feature size of the second input.
        out_features: Number of scores produced for every pair of positions.
        bias: Pair of flags; when a flag is true, a constant 1 feature is appended to the
            corresponding input before scoring.
    """

    def __init__(self, in1_features, in2_features, out_features,
                 bias=(True, True)):
        super().__init__()
        self.in1_features = in1_features
        self.in2_features = in2_features
        self.out_features = out_features
        self.bias = bias
        self.linear_input_size = in1_features + int(bias[0])
        self.linear_output_size = out_features * (in2_features + int(bias[1]))
        self.linear = nn.Linear(in_features=self.linear_input_size,
                                out_features=self.linear_output_size,
                                bias=False)

        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-uniform initialization of the single projection matrix."""
        nn.init.xavier_uniform_(self.linear.weight)

    def forward(self, input1, input2):
        """Score all position pairs.

        Args:
            input1: Tensor of size ``[batch, len1, in1_features]``.
            input2: Tensor of size ``[batch, len2, in2_features]``.

        Returns:
            Tensor of size ``[batch, len2, len1, out_features]``.
        """
        _, len1, _ = input1.size()
        batch_size, len2, dim2 = input2.size()
        if self.bias[0]:
            input1 = torch.cat((input1, input1.new_ones(batch_size, len1, 1)), dim=2)
        if self.bias[1]:
            input2 = torch.cat((input2, input2.new_ones(batch_size, len2, 1)), dim=2)
            dim2 += 1

        # Project input1 so that every (position, output channel) pair owns a dim2-sized vector.
        projected = self.linear(input1).view(batch_size, len1 * self.out_features, dim2)
        # Batched dot products against every position of input2, then put len2 first.
        pair_scores = torch.bmm(projected, input2.transpose(1, 2)).transpose(1, 2)
        return pair_scores.contiguous().view(batch_size, len2, len1, self.out_features)

    def __repr__(self):
        return '{} (in1_features={}, in2_features={}, out_features={})'.format(
            self.__class__.__name__, self.in1_features, self.in2_features, self.out_features)


class HighwayLSTMCell(nn.Module):
    """LSTM cell with a highway connection from the input to the hidden state.

    The input projection yields ``6 * hidden_size`` features: five gate pre-activations (input,
    forget, output, highway transform, candidate) plus a linear projection ``k`` of the input.
    The new hidden state is ``t * o * tanh(c) + (1 - t) * k``.

    Args:
        input_size: Feature size of the input.
        hidden_size: Feature size of the hidden and cell states.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.linear_ih = nn.Linear(in_features=input_size,
                                   out_features=6 * hidden_size)
        self.linear_hh = nn.Linear(in_features=hidden_size,
                                   out_features=5 * hidden_size,
                                   bias=False)
        self.reset_parameters()

    def reset_parameters(self):
        """Block-orthonormal weights and a zero input bias."""
        weight_ih = block_orth_normal_initializer([self.input_size, ], [self.hidden_size] * 6)
        self.linear_ih.weight.data.copy_(weight_ih)

        weight_hh = block_orth_normal_initializer([self.hidden_size, ], [self.hidden_size] * 5)
        self.linear_hh.weight.data.copy_(weight_hh)

        # In-place initializer; the non-underscore `nn.init.constant` is deprecated.
        nn.init.constant_(self.linear_ih.bias, 0.0)

    def forward(self, x, mask=None, hx=None, dropout=None):
        """Run one time step.

        Args:
            x: Input of size ``[batch, input_size]``.
            mask: Multiplicative mask broadcastable to ``[batch, hidden_size]``; where it is 0 the
                previous state is carried over unchanged. Required.
            hx: Pair ``(h, c)`` of previous hidden and cell states. Required.
            dropout: Optional callable applied to the new hidden state before masking.

        Returns:
            The pair ``(h, c)`` of new hidden and cell states.
        """
        assert mask is not None and hx is not None
        _h, _c = hx
        _x = self.linear_ih(x)
        preact = self.linear_hh(_h) + _x[:, :self.hidden_size * 5]

        i, f, o, t, j = preact.chunk(chunks=5, dim=1)
        # torch.sigmoid / torch.tanh replace the deprecated F.sigmoid / F.tanh.
        # The forget gate is biased by +1.0.
        i, f, o, t, j = torch.sigmoid(i), torch.sigmoid(f + 1.0), torch.sigmoid(o), torch.sigmoid(t), torch.tanh(j)
        k = _x[:, self.hidden_size * 5:]  # highway projection of the input

        c = f * _c + i * j
        c = mask * c + (1.0 - mask) * _c

        h = t * o * torch.tanh(c) + (1.0 - t) * k
        if dropout is not None:
            h = dropout(h)
        h = mask * h + (1.0 - mask) * _h
        return h, c


class VariationalLSTMCell(nn.Module):
    """LSTM-style cell with a coupled input/forget gate and dropout on the recurrent state.

    A single linear layer maps the concatenation of the input and the (optionally dropped-out)
    previous hidden state to three gate pre-activations. The cell state is updated as
    ``(1 - i) * c_prev + i * tanh(j)``.

    Args:
        input_size: Feature size of the input.
        hidden_size: Feature size of the hidden and cell states.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.linear = nn.Linear(in_features=input_size + self.hidden_size, out_features=3 * hidden_size)
        self.reset_parameters()

    def reset_parameters(self):
        """Block-orthonormal weights and a zero bias."""
        weight = block_orth_normal_initializer([self.input_size + self.hidden_size, ], [self.hidden_size] * 3)
        self.linear.weight.data.copy_(weight)
        nn.init.constant_(self.linear.bias, 0.0)

    def forward(self, x, mask=None, hx=None, dropout=None):
        """Run one time step.

        Args:
            x: Input of size ``[batch, input_size]``.
            mask: Multiplicative mask broadcastable to ``[batch, hidden_size]``; where it is 0 the
                returned states are zeroed (the previous state is not carried over). Required.
            hx: Pair ``(h, c)`` of previous hidden and cell states. Required.
            dropout: Optional callable applied to the previous hidden state.

        Returns:
            The pair ``(h, c)`` of new hidden and cell states.
        """
        assert mask is not None and hx is not None
        _h, _c = hx
        # `dropout` defaults to None, so it must not be called unconditionally.
        if dropout is not None:
            _h = dropout(_h)
        _x = self.linear(torch.cat([x, _h], 1))
        i, j, o = _x.chunk(3, dim=1)
        i = torch.sigmoid(i)
        c = (1.0 - i) * _c + i * torch.tanh(j)
        c = mask * c
        h = torch.tanh(c) * torch.sigmoid(o)
        h = mask * h

        return h, c


class VariationalLSTM(nn.Module):
    """A multi-layer, optionally bidirectional LSTM unrolled step by step with per-sequence dropout masks.

    In training mode one dropout mask per layer is sampled for the layer input and one for the
    hidden state; each mask is shared by all time steps. Kept units are rescaled by
    ``1 / (1 - rate)``.

    Args:
        input_size: Feature size of the input.
        hidden_size: Feature size of the hidden state of one direction.
        num_layers: Number of stacked layers.
        batch_first: If ``True``, ``input`` is ``[batch, time, feature]`` and ``masks`` is
            ``[batch, time]``; otherwise ``input`` is ``[time, batch, feature]`` and ``masks`` is
            ``[time, batch]`` or ``[time, batch, 1]``.
        bidirectional: Whether to run a backward cell as well and concatenate both directions.
        dropout_in: Dropout rate applied to the input of every layer.
        dropout_out: Dropout rate applied to the hidden state fed to the next time step.
    """

    _PARAM_PATTERNS = ('weight_ih_l{}{}', 'weight_hh_l{}{}', 'bias_ih_l{}{}', 'bias_hh_l{}{}')

    def __init__(self, input_size, hidden_size, num_layers=1, batch_first=False,
                 bidirectional=False, dropout_in=0, dropout_out=0):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.bidirectional = bidirectional
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.num_directions = 2 if bidirectional else 1

        # The cells live in plain lists, so they are not registered as sub-modules. Their
        # parameters are registered on this module below under nn.LSTM-style names instead.
        self.fcells = []
        self.bcells = []
        for layer in range(num_layers):
            layer_input_size = input_size if layer == 0 else hidden_size * self.num_directions
            self.fcells.append(nn.LSTMCell(input_size=layer_input_size, hidden_size=hidden_size))
            if self.bidirectional:
                self.bcells.append(nn.LSTMCell(input_size=layer_input_size, hidden_size=hidden_size))

        self._all_weights = []
        for layer in range(num_layers):
            self._register_cell(self.fcells[layer], layer, '')
            if self.bidirectional:
                self._register_cell(self.bcells[layer], layer, '_reverse')

        self.reset_parameters()

    def _register_cell(self, cell, layer, suffix):
        """Expose the four parameters of ``cell`` as attributes of this module and record their names."""
        param_names = [pattern.format(layer, suffix) for pattern in self._PARAM_PATTERNS]
        layer_params = (cell.weight_ih, cell.weight_hh, cell.bias_ih, cell.bias_hh)
        for name, param in zip(param_names, layer_params):
            setattr(self, name, param)
        self._all_weights.append(param_names)

    def reset_parameters(self):
        """Orthogonal weights and small normally distributed biases (std 0.01)."""
        for name, param in self.named_parameters():
            # In-place initializers; the non-underscore variants are deprecated.
            if "weight" in name:
                nn.init.orthogonal_(param)
            if "bias" in name:
                nn.init.normal_(param, 0.0, 0.01)

    @staticmethod
    def _scan(cell, input, masks, initial, drop_masks, time_steps):
        """Apply ``cell`` at every index of ``time_steps``; returns the per-step outputs and the last state."""
        output = []
        hx = initial
        for time in time_steps:
            h_next, c_next = cell(input=input[time], hx=hx)
            step_mask = masks[time]
            # Masked (padded) positions are reset to the initial state rather than carried over.
            h_next = h_next * step_mask + initial[0] * (1 - step_mask)
            c_next = c_next * step_mask + initial[1] * (1 - step_mask)
            output.append(h_next)
            # Dropout affects only the recurrent connection, not the emitted output.
            if drop_masks is not None:
                h_next = h_next * drop_masks
            hx = (h_next, c_next)
        return output, hx

    @staticmethod
    def _forward_rnn(cell, input, masks, initial, drop_masks):
        """Left-to-right pass over a time-major ``input``."""
        output, hx = VariationalLSTM._scan(cell, input, masks, initial, drop_masks, range(input.size(0)))
        return torch.stack(output, 0), hx

    @staticmethod
    def _forward_brnn(cell, input, masks, initial, drop_masks):
        """Right-to-left pass over a time-major ``input``; outputs are returned in time order."""
        output, hx = VariationalLSTM._scan(cell, input, masks, initial, drop_masks,
                                           reversed(range(input.size(0))))
        output.reverse()
        return torch.stack(output, 0), hx

    @staticmethod
    def _sample_dropout_mask(reference, batch_size, size, rate):
        """Sample a rescaled ``[batch_size, size]`` dropout mask with the dtype/device of ``reference``."""
        keep_prob = reference.new_full((batch_size, size), 1 - rate)
        return torch.bernoulli(keep_prob) / (1 - rate)

    def forward(self, input, masks, initial=None):
        """Encode a batch of sequences.

        Args:
            input: Sequence features, laid out according to ``batch_first``.
            masks: Float mask with 1 for real tokens and 0 for padding.
            initial: Optional pair ``(h0, c0)``, each ``[batch, hidden_size]``, shared by all
                layers and directions. Defaults to zeros.

        Returns:
            ``(output, (h_n, c_n))`` where ``output`` has the same layout as ``input`` with
            ``hidden_size * num_directions`` features, and ``h_n`` / ``c_n`` stack the final
            states of every layer along dimension 0.
        """
        if self.batch_first:
            input = input.transpose(0, 1)
            masks = torch.unsqueeze(masks.transpose(0, 1), dim=2)
        elif masks.dim() == 2:
            # Time-major masks without a feature axis: add it so that expand() below works.
            masks = masks.unsqueeze(2)
        max_time, batch_size, _ = input.size()
        masks = masks.expand(-1, -1, self.hidden_size)  # -1 keeps the existing size of a dimension
        if initial is None:
            zero_state = input.new_zeros(batch_size, self.hidden_size)
            initial = (zero_state, zero_state)  # h0, c0
        h_n = []
        c_n = []

        for layer in range(self.num_layers):
            max_time, batch_size, input_size = input.size()
            hidden_mask = None
            if self.training:
                # One mask per sequence, broadcast over the time dimension.
                input_mask = self._sample_dropout_mask(input, batch_size, input_size, self.dropout_in)
                input = input * input_mask.unsqueeze(0)
                hidden_mask = self._sample_dropout_mask(input, batch_size, self.hidden_size, self.dropout_out)

            layer_output, (layer_h_n, layer_c_n) = VariationalLSTM._forward_rnn(
                cell=self.fcells[layer], input=input, masks=masks, initial=initial, drop_masks=hidden_mask)
            if self.bidirectional:
                blayer_output, (blayer_h_n, blayer_c_n) = VariationalLSTM._forward_brnn(
                    cell=self.bcells[layer], input=input, masks=masks, initial=initial, drop_masks=hidden_mask)
                layer_output = torch.cat([layer_output, blayer_output], 2)
                layer_h_n = torch.cat([layer_h_n, blayer_h_n], 1)
                layer_c_n = torch.cat([layer_c_n, blayer_c_n], 1)

            h_n.append(layer_h_n)
            c_n.append(layer_c_n)
            input = layer_output  # the next layer consumes this layer's output

        h_n = torch.stack(h_n, 0)
        c_n = torch.stack(c_n, 0)
        if self.batch_first:
            input = input.transpose(1, 0)
        return input, (h_n, c_n)


================================================
FILE: hanlp/components/srl/span_rank/span_rank.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-09 18:13
import logging
from bisect import bisect
from typing import Union, List, Callable, Tuple, Dict, Any

from hanlp_common.constant import IDX
from hanlp.layers.transformers.utils import build_optimizer_scheduler_with_transformer
import torch
from torch.utils.data import DataLoader
from hanlp.common.dataset import PadSequenceDataLoader, SortingSampler
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import FieldLength
from hanlp.common.vocab import Vocab
from hanlp.components.srl.span_rank.inference_utils import srl_decode
from hanlp.components.srl.span_rank.span_ranking_srl_model import SpanRankingSRLModel
from hanlp.components.srl.span_rank.srl_eval_utils import compute_srl_f1
from hanlp.datasets.srl.loaders.conll2012 import CoNLL2012SRLDataset, filter_v_args, unpack_srl, \
    group_pa_by_p
from hanlp.layers.embeddings.embedding import Embedding
from hanlp.metrics.f1 import F1
from hanlp_common.visualization import markdown_table
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs, reorder


class SpanRankingSemanticRoleLabeler(TorchComponent):
    def __init__(self, **kwargs) -> None:
        """An implementation of "Jointly Predicting Predicates and Arguments in Neural Semantic Role Labeling"
        (:cite:`he-etal-2018-jointly`). It generates candidate triples of (predicate, arg_start, arg_end) and
        ranks them.

        Args:
            **kwargs: Predefined config, forwarded unchanged to the parent constructor.
        """
        super().__init__(**kwargs)
        # No model is created in this constructor; the attribute starts out as ``None``.
        self.model: SpanRankingSRLModel = None

    def build_optimizer(self,
                        trn,
                        epochs,
                        lr,
                        adam_epsilon,
                        weight_decay,
                        warmup_steps,
                        transformer_lr,
                        **kwargs):
        """Create the optimizer together with its learning-rate scheduler.

        Args:
            trn: Training dataloader; only its length is used, to derive the number of training steps.
            epochs: Number of training epochs.
            lr: Learning rate handed to the transformer-aware builder. The non-transformer branch reads
                ``self.config.lr`` instead.
            adam_epsilon: Forwarded to the transformer-aware builder.
            weight_decay: Forwarded to the transformer-aware builder.
            warmup_steps: Forwarded to the transformer-aware builder.
            transformer_lr: Learning rate forwarded to the transformer-aware builder.
            **kwargs: Ignored.

        Returns:
            An ``(optimizer, scheduler)`` pair.
        """
        transformer = self._get_transformer()
        if not transformer:
            # Without a transformer: plain Adam whose learning rate is halved when the monitored
            # (maximised) score stops improving for more than two scheduler steps.
            optimizer = torch.optim.Adam(self.model.parameters(), self.config.lr)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='max', factor=0.5,
                                                                   patience=2, verbose=True)
            return optimizer, scheduler

        # One scheduler step is planned per ``gradient_accumulation`` batches.
        num_training_steps = len(trn) * epochs // self.config.get('gradient_accumulation', 1)
        optimizer, scheduler = build_optimizer_scheduler_with_transformer(
            self.model, transformer, lr, transformer_lr, num_training_steps, warmup_steps, weight_decay,
            adam_epsilon)
        return optimizer, scheduler

    def _get_transformer(self):
        return getattr(self.model_.embed, 'transformer', None)

    def build_criterion(self, **kwargs):
        pass

    # noinspection PyProtectedMember
    def build_metric(self, **kwargs) -> Tuple[F1, F1]:
        """Create two fresh F1 metrics, returned as ``(predicate_f1, end_to_end_f1)``."""
        return F1(), F1()

    def execute_training_loop(self,
                              trn: DataLoader,
                              dev: DataLoader,
                              epochs,
                              criterion,
                              optimizer,
                              metric,
                              save_dir,
                              logger: logging.Logger,
                              devices,
                              **kwargs):
        """Run the epoch loop: train, optionally evaluate, and save the weights whenever the score improves.

        Args:
            trn: Training dataloader.
            dev: Development dataloader; evaluation is skipped when it is falsy.
            epochs: Number of epochs to run.
            criterion: Passed through to ``fit_dataloader`` / ``evaluate_dataloader``.
            optimizer: An ``(optimizer, scheduler)`` pair as returned by ``build_optimizer``.
            metric: A ``(predicate, end_to_end)`` pair of metrics; the end-to-end score drives model selection.
            save_dir: Directory the best weights are saved to.
            logger: Logger used for progress reporting.
            devices: Unused here.
            **kwargs: May carry ``gradient_accumulation``; otherwise the value is read from ``self.config``.
        """
        best_metric = -1
        _, end_to_end = metric
        optimizer, scheduler = optimizer
        # ``build_dataloader`` and ``build_optimizer`` already shrink the batches and the planned number of
        # steps by this factor, so it must reach ``fit_dataloader`` too; otherwise the optimizer steps on
        # every (smaller) batch.
        gradient_accumulation = kwargs.get('gradient_accumulation') or self.config.get('gradient_accumulation', 1)
        timer = CountdownTimer(epochs)
        ratio_width = len(f'{len(trn)}/{len(trn)}')
        for epoch in range(1, epochs + 1):
            logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
            # With a transformer the scheduler is stepped per optimizer step inside ``fit_dataloader``.
            self.fit_dataloader(trn, criterion, optimizer, metric, logger,
                                linear_scheduler=scheduler if self._get_transformer() else None,
                                gradient_accumulation=gradient_accumulation)
            if dev:
                self.evaluate_dataloader(dev, criterion, metric, logger, ratio_width=ratio_width)
            report = f'{timer.elapsed_human}/{timer.total_time_human}'
            dev_score = end_to_end.score
            if not self._get_transformer():
                # Without a transformer the scheduler is stepped once per epoch on the latest score.
                scheduler.step(dev_score)
            if dev_score > best_metric:
                self.save_weights(save_dir)
                best_metric = dev_score
                report += ' [red]saved[/red]'
            timer.log(report, ratio_percentage=False, newline=True, ratio=False)

    def fit_dataloader(self,
                       trn: DataLoader,
                       criterion,
                       optimizer,
                       metric,
                       logger: logging.Logger,
                       linear_scheduler=None,
                       gradient_accumulation=1,
                       **kwargs):
        """Train the model for one pass over ``trn``.

        Args:
            trn: Training dataloader.
            criterion: Unused here; the loss is read from the model output.
            optimizer: Optimizer stepped once every ``gradient_accumulation`` batches.
            metric: ``(predicate, end_to_end)`` metrics, reset at the start and updated on every batch.
            logger: Logger handed to the progress timer.
            linear_scheduler: Optional scheduler stepped together with the optimizer.
            gradient_accumulation: Number of batches whose gradients are accumulated per optimizer step.
                ``None`` or ``0`` is treated as ``1``.
            **kwargs: Ignored.

        Returns:
            The accumulated (reported) loss divided by the number of full accumulation steps in this pass.
        """
        # A falsy value would break the integer division and the modulo below.
        gradient_accumulation = max(int(gradient_accumulation or 1), 1)
        self.model.train()
        timer = CountdownTimer(len(trn) // gradient_accumulation)
        total_loss = 0
        self.reset_metrics(metric)
        for idx, batch in enumerate(trn):
            output_dict = self.feed_batch(batch)
            self.update_metrics(batch, output_dict, metric)
            loss = output_dict['loss']
            loss = loss.sum()  # For data parallel
            if torch.isnan(loss):  # w/ gold pred, some batches do not have PAs at all, resulting in empty scores
                loss_value = 0.0
            else:
                loss.backward()
                loss_value = loss.item()
            del loss
            # Only the reported value is scaled; gradients are accumulated unscaled, as before.
            total_loss += loss_value / gradient_accumulation
            if (idx + 1) % gradient_accumulation == 0:
                self._clip_and_step(optimizer, linear_scheduler)
                # The current batch is already part of ``total_loss`` so the running average is not lagging.
                timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
                          logger=logger)
        if len(trn) % gradient_accumulation:
            # Flush the gradients of the trailing, incomplete accumulation group.
            self._clip_and_step(optimizer, linear_scheduler)
        # ``timer.total`` is 0 when there are fewer batches than ``gradient_accumulation``.
        return total_loss / max(timer.total, 1)

    def _clip_and_step(self, optimizer, linear_scheduler):
        """Clip the fully accumulated gradients (if ``grad_norm`` is configured), then take one step."""
        if self.config.grad_norm:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm)
        self._step(optimizer, linear_scheduler)

    def _step(self, optimizer, linear_scheduler):
        optimizer.step()
        optimizer.zero_grad()
        if linear_scheduler:
            linear_scheduler.step()

    # noinspection PyMethodOverriding
    @torch.no_grad()
    def evaluate_dataloader(self,
                            data: DataLoader,
                            criterion: Callable,
                            metric,
                            logger,
                            ratio_width=None,
                            output=False,
                            official=False,
                            confusion_matrix=False,
                            **kwargs):
        """Evaluate the model on ``data`` without tracking gradients.

        Args:
            data: Dataloader to evaluate on.
            criterion: Unused here; the loss is read from the model output.
            metric: ``(predicate, end_to_end)`` metrics, reset first and then updated on every batch.
            logger: Logger for progress and, in official mode, for the score tables. May be falsy.
            ratio_width: Forwarded to the progress timer.
            output: Unused here.
            official: If true, collect tokens, gold and predicted SRL over all batches and score them with
                ``compute_srl_f1``.
            confusion_matrix: In official mode, additionally log a label confusion matrix.
            **kwargs: Ignored.

        Returns:
            ``(average loss, scores)`` where ``scores`` is the ``compute_srl_f1`` result in official mode
            and ``metric`` otherwise.
        """
        self.model.eval()
        self.reset_metrics(metric)
        timer = CountdownTimer(len(data))
        total_loss = 0
        sentences, gold_srl, pred_srl = [], [], []
        for batch in data:
            output_dict = self.feed_batch(batch)
            if official:
                sentences += batch['token']
                gold_srl += batch['srl']
                pred_srl += output_dict['prediction']
            self.update_metrics(batch, output_dict, metric)
            # Same reduction and NaN guard as in ``fit_dataloader``: a multi-element loss is summed and a
            # NaN loss (reported there for batches without any predicate-argument pair) is skipped, so a
            # single such batch does not turn the whole average into NaN.
            loss = output_dict['loss'].sum()
            if not torch.isnan(loss):
                total_loss += loss.item()
            timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
                      logger=logger,
                      ratio_width=ratio_width)
            del loss
        if official:
            scores = compute_srl_f1(sentences, gold_srl, pred_srl)
            if logger:
                if confusion_matrix:
                    labels = sorted(set(y for x in scores.label_confusions.keys() for y in x))
                    headings = ['GOLD↓PRED→'] + labels
                    matrix = []
                    for gold_label in labels:
                        row = [gold_label]
                        row.extend(scores.label_confusions.get((gold_label, pred_label), 0)
                                   for pred_label in labels)
                        matrix.append(row)
                    matrix = markdown_table(headings, matrix)
                    logger.info(f'{"Confusion Matrix": ^{len(matrix.splitlines()[0])}}')
                    logger.info(matrix)
                headings = ['Settings', 'Precision', 'Recall', 'F1']
                rows = []
                for h, (p, r, f) in zip(['Unlabeled', 'Labeled', 'Official'], [
                    [scores.unlabeled_precision, scores.unlabeled_recall, scores.unlabeled_f1],
                    [scores.precision, scores.recall, scores.f1],
                    [scores.conll_precision, scores.conll_recall, scores.conll_f1],
                ]):
                    rows.append([h] + [f'{x:.2%}' for x in [p, r, f]])
                table = markdown_table(headings, rows)
                logger.info(f'{"Scores": ^{len(table.splitlines()[0])}}')
                logger.info(table)
        else:
            scores = metric
        # Guard against an empty dataloader.
        return total_loss / max(timer.total, 1), scores

    def build_model(self,
                    training=True,
                    **kwargs) -> torch.nn.Module:
        """Instantiate the span-ranking SRL model from the stored config.

        Args:
            training: Forwarded to the embedding builder (``self.config.embed.module``).
            **kwargs: Ignored.

        Returns:
            A new ``SpanRankingSRLModel`` sized for ``len(self.vocabs.srl_label)`` labels.
        """
        config = self.config
        # noinspection PyTypeChecker
        embed = config.embed.module(vocabs=self.vocabs, training=training)
        context_layer = config.context_layer
        num_labels = len(self.vocabs.srl_label)
        return SpanRankingSRLModel(config, embed, context_layer, num_labels)

    # noinspection PyMethodOverriding
    def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger,
                         generate_idx=False, transform=None, **kwargs) -> DataLoader:
        """Build a length-sorted, padded dataloader over ``data``.

        Both ``batch_size`` and the configured ``batch_max_tokens`` are integer-divided by the configured
        ``gradient_accumulation`` (default 1) when they are truthy.

        Args:
            data: Anything accepted by ``build_dataset``.
            batch_size: Requested batch size before the division; may be falsy.
            shuffle: Forwarded to the sampler.
            device: Device the padded batches are placed on.
            logger: Forwarded to ``build_dataset``.
            generate_idx: Forwarded to ``build_dataset``.
            transform: Optional extra transform, forwarded to ``build_dataset``.
            **kwargs: Ignored.

        Returns:
            A ``PadSequenceDataLoader`` driven by a ``SortingSampler`` over the samples' ``token_length``.
        """
        max_tokens = self.config.batch_max_tokens
        accumulation = self.config.get('gradient_accumulation', 1)
        if batch_size:
            batch_size = batch_size // accumulation
        if max_tokens:
            max_tokens = max_tokens // accumulation
        dataset = self.build_dataset(data, generate_idx, logger, transform)

        lengths = [sample['token_length'] for sample in dataset]
        sampler = SortingSampler(lengths, batch_size=batch_size, batch_max_tokens=max_tokens, shuffle=shuffle)
        return PadSequenceDataLoader(batch_sampler=sampler, device=device, dataset=dataset)

    def build_dataset(self, data, generate_idx, logger, transform=None):
        """Create the SRL dataset and attach every transform it needs.

        Transforms are appended in this order: the caller's ``transform`` (if any), the embedding's
        transform (if the configured ``embed`` is an ``Embedding`` and yields one), the vocabularies, and
        a ``FieldLength('token')``.

        Args:
            data: Source handed to ``CoNLL2012SRLDataset``.
            generate_idx: Forwarded to the dataset constructor.
            logger: Used when vocabularies have to be built.
            transform: Optional extra transform.

        Returns:
            The prepared dataset. Vocabularies are built from it first if they are still mutable.
        """
        base_transforms = [filter_v_args, unpack_srl, group_pa_by_p]
        dataset = CoNLL2012SRLDataset(data, transform=base_transforms,
                                      doc_level_offset=self.config.doc_level_offset, generate_idx=generate_idx)
        if transform:
            dataset.append_transform(transform)
        if isinstance(self.config.get('embed', None), Embedding):
            embed_transform = self.config.embed.transform(vocabs=self.vocabs)
            if embed_transform:
                dataset.append_transform(embed_transform)
        dataset.append_transform(self.vocabs)
        dataset.append_transform(FieldLength('token'))
        if isinstance(data, str):
            dataset.purge_cache()  # Enable cache
        if self.vocabs.mutable:
            self.build_vocabs(dataset, logger)
        return dataset

    def predict(self, data: Union[str, List[str]], batch_size: int = None, fmt='dict', **kwargs):
        """Predict semantic roles for one tokenized sentence or a list of them.

        Args:
            data: A single sentence (a sequence whose first element is a ``str``) or a list of sentences.
            batch_size: Batch size; falls back to ``self.config.batch_size`` when falsy.
            fmt: ``'list'`` converts the per-sentence dicts with ``format_dict_to_results``; any other value
                leaves them as they are.
            **kwargs: Ignored.

        Returns:
            ``[]`` for empty input; otherwise the predictions in input order, unwrapped to a single item
            when a single sentence was given.
        """
        if not data:
            return []
        flat = self.input_is_flat(data)
        if flat:
            data = [data]
        samples = [{'token': tokens} for tokens in data]
        dataloader = self.build_dataloader(samples, batch_size or self.config.batch_size, False, self.device, None,
                                           generate_idx=True)
        outputs, order = [], []
        for batch in dataloader:
            outputs.extend(self.feed_batch(batch)['prediction'])
            order.extend(batch[IDX])
        # The sampler sorts by length, so restore the caller's order.
        outputs = reorder(outputs, order)
        if fmt == 'list':
            outputs = self.format_dict_to_results(data, outputs)
        return outputs[0] if flat else outputs

    @staticmethod
    def format_dict_to_results(data, outputs, exclusive_offset=False, with_predicate=False, with_argument=False,
                               label_first=False):
        results = []
        for i in range(len(outputs)):
            tokens = data[i]
            output = []
            for p, a in outputs[i].items():
                # a: [(0, 0, 'ARG0')]
                if with_predicate:
                    a.insert(bisect([x[0] for x in a], p), (p, p, 'PRED'))
                if with_argument is not False:
                    a = [x + (tokens[x[0]:x[1] + 1],) for x in a]
                    if isinstance(with_argument, str):
                        a = [x[:-1] + (with_argument.join(x[-1]),) for x in a]
                if exclusive_offset:
                    a = [(x[0], x[1] + 1) + x[2:] for x in a]
                if label_first:
                    a = [tuple(reversed(x[2:])) + x[:2] for x in a]
                output.append(a)
            results.append(output)
        return results

    def input_is_flat(self, data):
        return isinstance(data[0], str)

    # noinspection PyMethodOverriding
    def fit(self,
            trn_data,
            dev_data,
            save_dir,
            embed,
            context_layer,
            batch_size=40,
            batch_max_tokens=700,
            lexical_dropout=0.5,
            dropout=0.2,
            span_width_feature_size=20,
            ffnn_size=150,
            ffnn_depth=2,
            argument_ratio=0.8,
            predicate_ratio=0.4,
            max_arg_width=30,
            mlp_label_size=100,
            enforce_srl_constraint=False,
            use_gold_predicates=False,
            doc_level_offset=True,
            use_biaffine=False,
            lr=1e-3,
            transformer_lr=1e-5,
            adam_epsilon=1e-6,
            weight_decay=0.01,
            warmup_steps=0.1,
            grad_norm=5.0,
            gradient_accumulation=1,
            loss_reduction='sum',
            transform=None,
            devices=None,
            logger=None,
            seed=None,
            **kwargs
            ):
        """Train this component.

        Every named argument is collected through ``locals()``, merged with ``kwargs`` by
        ``merge_locals_kwargs`` and forwarded to the parent ``fit``. For that reason this body must not
        introduce local variables: they would be captured by ``locals()`` as well.

        NOTE(review): the hyper-parameters below are presumably stored in ``self.config`` by the parent
        ``fit`` and read back by the other methods of this class and by the model — confirm against
        ``TorchComponent.fit``.

        Args:
            trn_data: Training data.
            dev_data: Development data.
            save_dir: Directory the component is saved to.
            embed: Embedding configuration.
            context_layer: Context layer configuration.
            batch_size: Batch size (default 40).
            batch_max_tokens: Token budget per batch (default 700).
            lexical_dropout: Default 0.5.
            dropout: Default 0.2.
            span_width_feature_size: Default 20.
            ffnn_size: Default 150.
            ffnn_depth: Default 2.
            argument_ratio: Default 0.8.
            predicate_ratio: Default 0.4.
            max_arg_width: Default 30.
            mlp_label_size: Default 100.
            enforce_srl_constraint: Default ``False``.
            use_gold_predicates: Default ``False``.
            doc_level_offset: Default ``True``.
            use_biaffine: Default ``False``.
            lr: Learning rate (default 1e-3).
            transformer_lr: Learning rate for the transformer (default 1e-5).
            adam_epsilon: Default 1e-6.
            weight_decay: Default 0.01.
            warmup_steps: Default 0.1.
            grad_norm: Gradient clipping norm (default 5.0).
            gradient_accumulation: Number of batches accumulated per optimizer step (default 1).
            loss_reduction: Default ``'sum'``.
            transform: Optional extra dataset transform.
            devices: Devices to train on.
            logger: Logger to use.
            seed: Random seed.
            **kwargs: Further options merged into the forwarded arguments.

        Returns:
            Whatever the parent ``fit`` returns.
        """

        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def build_vocabs(self, dataset, logger, **kwargs):
        """Create the SRL label vocabulary and lock all vocabularies after one pass over ``dataset``.

        The label vocabulary has no pad/unk token and starts with ``'<null>'``. The pass over the dataset
        only tracks and displays the longest ``token_input_ids``; any vocabulary growth during that pass
        presumably comes from the dataset's transforms — TODO confirm.

        Args:
            dataset: Dataset to iterate once.
            logger: Logger the vocabulary summary is written to.
            **kwargs: Ignored.
        """
        self.vocabs.srl_label = Vocab(pad_token=None, unk_token=None)
        # Use null to indicate no relationship
        self.vocabs.srl_label.add('<null>')
        progress = CountdownTimer(len(dataset))
        longest = 0
        for sample in dataset:
            longest = max(longest, len(sample['token_input_ids']))
            progress.log(f'Building vocabs (max sequence length {longest}) [blink][yellow]...[/yellow][/blink]')
        progress.stop()
        progress.erase()
        self.vocabs['srl_label'].set_unk_as_safe_unk()
        self.vocabs.lock()
        self.vocabs.summary(logger)

    def reset_metrics(self, metrics):
        for each in metrics:
            each.reset()

    def report_metrics(self, loss, metrics):
        predicate, end_to_end = metrics
        return f'loss: {loss:.4f} predicate: {predicate.score:.2%} end_to_end: {end_to_end.score:.2%}'

    def feed_batch(self, batch) -> Dict[str, Any]:
        """Run the model on ``batch`` and store the decoded result under ``'prediction'`` in its output."""
        output_dict = self.model(batch)
        output_dict['prediction'] = self.decode_output(output_dict, batch, self.model.training)
        return output_dict

    def decode_output(self, output_dict, batch, training=False):
        idx_to_label = self.vocabs['srl_label'].idx_to_token
        if training:
            # Use fast decoding during training,
            prediction = []
            top_predicate_indices = output_dict['predicates'].tolist()
            top_spans = torch.stack([output_dict['arg_starts'], output_dict['arg_ends']], dim=-1).tolist()
            srl_mask = output_dict['srl_mask'].tolist()
            srl_scores = output_dict['srl_scores']
            pal_list = srl_scores.argmax(-1).tolist() if srl_scores.numel() else []
            for n, (pal, predicate_indices, argument_spans) in enumerate(
                    zip(pal_list, top_predicate_indices, top_spans)):
                srl_per_sentence = {}
                for p, (al, predicate_index) in enumerate(zip(pal, predicate_indices)):
                    for a, (l, argument_span) in enumerate(zip(al, argument_spans)):
                        if l and srl_mask[n][p][a]:
                            args = srl_per_sentence.get(p, None)
                            if args is None:
                                args = srl_per_sentence[p] = []
                            args.append((*argument_span, idx_to_label[l]))
                prediction.append(srl_per_sentence)
        else:
            prediction = srl_decode(batch['token_length'], output_dict, idx_to_label, self.config)
        return prediction

    def update_metrics(self, batch: dict, output_dict: dict, metrics):
        def unpack(y: dict):
            return set((p, bel) for p, a in y.items() for bel in a)

        predicate, end_to_end = metrics
        for pred, gold in zip(output_dict['prediction'], batch['srl']):
            predicate(pred.keys(), gold.keys())
            end_to_end(unpack(pred), unpack(gold))


================================================
FILE: hanlp/components/srl/span_rank/span_ranking_srl_model.py
================================================
from typing import Dict

import hanlp.utils.torch_util
from hanlp.layers.feedforward import FeedForward
from hanlp.layers.time_distributed import TimeDistributed

from .highway_variational_lstm import *
import torch

from ...parsers.biaffine.biaffine import Biaffine


def initializer_1d(input_tensor, initializer):
    """Apply a matrix initializer to a 1-D tensor by presenting it as a single column.

    Args:
        input_tensor: A 1-D tensor (asserted).
        initializer: Callable that receives the ``[n, 1]`` view and returns the initialized tensor.

    Returns:
        The initializer's result flattened back to 1-D.
    """
    assert input_tensor.dim() == 1
    column = input_tensor.view(-1, 1)
    initialized = initializer(column)
    return initialized.view(-1)


class SpanRankingSRLDecoder(nn.Module):

    def __init__(self, context_layer_output_dim, label_space_size, config) -> None:
        """Create the scoring networks of the span-ranking decoder.

        Three feed-forward stacks are built, each with ``config.ffnn_depth`` hidden layers of
        ``config.ffnn_size`` units plus matching dropout layers and a final projection:

        - argument scorer: span embedding -> 1
        - predicate scorer: token representation -> 1
        - SRL scorer: span embedding + token representation -> ``label_space_size - 1``

        Args:
            context_layer_output_dim: Size of the token representations fed to the decoder.
            label_space_size: Number of SRL labels.
            config: Configuration providing ``dropout``, ``use_gold_predicates``, ``max_arg_width``,
                ``span_width_feature_size``, ``ffnn_size``, ``ffnn_depth``, ``use_biaffine`` and
                ``loss_reduction``.
        """
        super().__init__()
        self.config = config
        self.label_space_size = label_space_size
        self.dropout = float(config.dropout)
        self.use_gold_predicates = config.use_gold_predicates

        hidden_size = config.ffnn_size
        depth = config.ffnn_depth

        def hidden_layers(input_size):
            # The first layer consumes the input, the remaining ones map hidden -> hidden.
            return nn.ModuleList([nn.Linear(input_size if i == 0 else hidden_size, hidden_size)
                                  for i in range(depth)])

        def dropout_layers():
            return nn.ModuleList([nn.Dropout(self.dropout) for _ in range(depth)])

        # span width feature embedding
        self.span_width_embedding = nn.Embedding(config.max_arg_width, config.span_width_feature_size)
        # span scores: start, end and mean token embedding plus the width feature
        self.span_emb_size = 3 * context_layer_output_dim + config.span_width_feature_size
        self.arg_unary_score_layers = hidden_layers(self.span_emb_size)  # [,150]
        self.arg_dropout_layers = dropout_layers()
        self.arg_unary_score_projection = nn.Linear(hidden_size, 1)
        # predicate scores
        self.pred_unary_score_layers = hidden_layers(context_layer_output_dim)  # [,150]
        self.pred_dropout_layers = dropout_layers()
        self.pred_unary_score_projection = nn.Linear(hidden_size, 1)
        # srl scores
        self.srl_unary_score_input_size = self.span_emb_size + context_layer_output_dim
        self.srl_unary_score_layers = hidden_layers(self.srl_unary_score_input_size)
        self.srl_dropout_layers = dropout_layers()
        self.srl_unary_score_projection = nn.Linear(hidden_size, self.label_space_size - 1)
        if config.use_biaffine:
            self.predicate_scale = TimeDistributed(FeedForward(context_layer_output_dim, 1, self.span_emb_size, 'ReLU'))
            self.biaffine = Biaffine(self.span_emb_size, self.label_space_size - 1)
        self.loss_reduction = config.loss_reduction
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialize the span width embedding and every layer of the three scoring stacks.

        Weights use ``init.xavier_uniform_`` directly; biases are 1-D and go through ``initializer_1d``.
        """
        init.xavier_uniform_(self.span_width_embedding.weight)

        scorers = (
            (self.arg_unary_score_layers, self.arg_unary_score_projection),
            (self.pred_unary_score_layers, self.pred_unary_score_projection),
            (self.srl_unary_score_layers, self.srl_unary_score_projection),
        )
        for hidden_layers, projection in scorers:
            # Hidden layers first, then the projection, one stack after the other.
            for linear in (*hidden_layers, projection):
                init.xavier_uniform_(linear.weight)
                initializer_1d(linear.bias, init.xavier_uniform_)
        return None

    def forward(self, hidden_states, batch, mask=None):
        """Unpack the gold annotations from ``batch`` and decode ``hidden_states`` against them.

        Args:
            hidden_states: Token representations handed to ``decode``.
            batch: Input batch, unpacked by ``SpanRankingSRLModel.unpack``.
            mask: Optional mask forwarded to ``SpanRankingSRLModel.unpack``.

        Returns:
            Whatever ``decode`` returns.
        """
        unpacked = SpanRankingSRLModel.unpack(batch, mask=mask, training=self.training)
        gold_arg_ends, gold_arg_labels, gold_arg_starts, gold_predicates, masks, sent_lengths = unpacked
        return self.decode(hidden_states, sent_lengths, masks, gold_arg_starts, gold_arg_ends, gold_arg_labels,
                           gold_predicates)

    @staticmethod
    def get_candidate_spans(sent_lengths: torch.Tensor, max_sent_length, max_arg_width):
        num_sentences = len(sent_lengths)
        device = sent_lengths.device
        candidate_starts = torch.arange(0, max_sent_length, device=device).expand(num_sentences, max_arg_width, -1)
        candidate_width = torch.arange(0, max_arg_width, device=device).view(1, -1, 1)
        candidate_ends = candidate_starts + candidate_width

        candidate_starts = candidate_starts.contiguous().view(num_sentences, max_sent_length * max_arg_width)
        candidate_ends = candidate_ends.contiguous().view(num_sentences, max_sent_length * max_arg_width)
        actual_sent_lengths = sent_lengths.view(-1, 1).expand(-1, max_sent_length * max_arg_width)
        candidate_mask = candidate_ends < actual_sent_lengths

        candidate_starts = candidate_starts * candidate_mask
        candidate_ends = candidate_ends * candidate_mask
        return candidate_starts, candidate_ends, candidate_mask

    @staticmethod
    def exclusive_cumsum(input: torch.Tensor, exclusive=True):
        """

        Args:
          input: input is the sentence lengths tensor.
          exclusive: exclude the last sentence length (Default value = True)
          input(torch.Tensor :): 
          input: torch.Tensor: 

        Returns:

        
        """
        assert exclusive is True
        if exclusive is True:
            exclusive_sent_lengths = input.new_zeros(1, dtype=torch.long)
            result = torch.cumsum(torch.cat([exclusive_sent_lengths, input], 0)[:-1], 0).view(-1, 1)
        else:
            result = torch.cumsum(input, 0).view(-1, 1)
        return result

    def flatten_emb(self, emb):
        num_sentences, max_sentence_length = emb.size()[0], emb.size()[1]
        assert len(emb.size()) == 3
        flatted_emb = emb.contiguous().view(num_sentences * max_sentence_length, -1)
        return flatted_emb

    def flatten_emb_in_sentence(self, emb, batch_sentences_mask):
        num_sentences, max_sentence_length = emb.size()[0], emb.size()[1]
        flatted_emb = self.flatten_emb(emb)
        return flatted_emb[batch_sentences_mask.reshape(num_sentences * max_sentence_length)]

    def get_span_emb(self, flatted_context_emb, flatted_candidate_starts, flatted_candidate_ends,
                     config, dropout=0.0):
        """Build span embeddings from flattened token embeddings.

        Each span embedding concatenates: the start token embedding, the end token embedding, an
        embedding of the span width, and the mean over ``config.max_arg_width`` positions of the masked
        token embeddings starting at the span start.

        Args:
            flatted_context_emb: Token embeddings indexed by flat token position along dimension 0.
            flatted_candidate_starts: Flat start position of every span.
            flatted_candidate_ends: Flat (inclusive) end position of every span.
            config: Configuration providing ``max_arg_width``.
            dropout: Dropout probability applied to the width embedding while training.

        Returns:
            ``(span_emb, None, span_text_emb, span_indices, span_indices_mask)``.
        """
        last_token = flatted_context_emb.size(0) - 1
        max_arg_width = config.max_arg_width

        # Boundary representations: gather the embeddings at the span starts and ends.
        start_emb = flatted_context_emb[flatted_candidate_starts]
        end_emb = flatted_context_emb[flatted_candidate_ends]

        # Width feature; widths are 1-based while the embedding table is 0-based.
        span_width = 1 + flatted_candidate_ends - flatted_candidate_starts  # [num_spans]
        width_emb = self.span_width_embedding(span_width - 1)
        width_emb = F.dropout(width_emb, dropout, self.training)

        # Head features: positions start, start + 1, ..., start + max_arg_width - 1 for every span,
        # clamped so that they never point past the last token.
        offsets = torch.arange(0, max_arg_width, device=flatted_context_emb.device).view(1, -1)
        span_indices = torch.clamp(offsets + flatted_candidate_starts.view(-1, 1), max=last_token)
        num_spans, spans_width = span_indices.size(0), span_indices.size(1)
        span_text_emb = flatted_context_emb.index_select(0, span_indices.view(-1)) \
            .view(num_spans, spans_width, -1)
        # Mask derived from the span widths (presumably true for the positions inside each span —
        # confirm against ``lengths_to_mask``).
        span_indices_mask = hanlp.utils.torch_util.lengths_to_mask(span_width, max_len=max_arg_width)
        span_text_emb = span_text_emb * span_indices_mask.unsqueeze(2).expand(-1, -1, span_text_emb.size(-1))
        head_emb = torch.mean(span_text_emb, 1)

        span_emb = torch.cat([start_emb, end_emb, width_emb, head_emb], 1)
        return span_emb, None, span_text_emb, span_indices, span_indices_mask

    def get_arg_unary_scores(self, span_emb):
        """Compute span score with FFNN(span embedding)

        Args:
          span_emb: tensor of [num_sentences, num_spans, emb_size]
          config: param dropout:
          num_labels: param name:

        Returns:

        
        """
        input = span_emb
        for i, ffnn in enumerate(self.arg_unary_score_layers):
            input = F.relu(ffnn.forward(input))
            input = self.arg_dropout_layers[i].forward(input)
        output = self.arg_unary_score_projection.forward(input)
        return output

    def get_pred_unary_scores(self, span_emb):
        input = span_emb
        for i, ffnn in enumerate(self.pred_unary_score_layers):
            input = F.relu(ffnn.forward(input))
            input = self.pred_dropout_layers[i].forward(input)
        output = self.pred_unary_score_projection.forward(input)
        return output

    def extract_spans(self, candidate_scores, candidate_starts, candidate_ends, topk, max_sentence_length,
                      sort_spans, enforce_non_crossing):
        """extract the topk span indices

        Args:
          candidate_scores: param candidate_starts:
          candidate_ends: param topk: [num_sentences]
          max_sentence_length: param sort_spans:
          enforce_non_crossing: return: indices [num_sentences, max_num_predictions]
          candidate_starts: 
          topk: 
          sort_spans: 

        Returns:

        
        """
        # num_sentences = candidate_scores.size()[0]
        # num_input_spans = candidate_scores.size()[1]
        max_num_output_spans = int(torch.max(topk))
        indices = [score.topk(k)[1] for score, k in zip(candidate_scores, topk)]
        output_span_indices_tensor = [F.pad(item, [0, max_num_output_spans - item.size()[0]], value=item[-1])
                                      for item in indices]
        output_span_indices_tensor = torch.stack(output_span_indices_tensor)
        return output_span_indices_tensor

    def batch_index_select(self, emb, indices):
        num_sentences = emb.size()[0]
        max_sent_length = emb.size()[1]
        flatten_emb = self.flatten_emb(emb)
        offset = (torch.arange(0, num_sentences, device=emb.device) * max_sent_length).unsqueeze(1)
        return torch.index_select(flatten_emb, 0, (indices + offset).view(-1)) \
            .view(indices.size()[0], indices.size()[1], emb.size(-1))

    def get_batch_topk(self, candidate_starts: torch.Tensor, candidate_ends, candidate_scores, topk_ratio, text_len,
                       max_sentence_length, sort_spans=False, enforce_non_crossing=True):
        num_sentences = candidate_starts.size()[0]
        max_sentence_length = candidate_starts.size()[1]

        topk = torch.floor(text_len.to(torch.float) * topk_ratio).to(torch.long)
        topk = torch.max(topk, torch.ones(num_sentences, device=candidate_starts.device, dtype=torch.long))

        # this part should be implemented with C++
        predicted_indices = self.extract_spans(candidate_scores, candidate_starts, candidate_ends, topk,
                                               max_sentence_length, sort_spans, enforce_non_crossing)
        predicted_starts = torch.gather(candidate_starts, 1, predicted_indices)
        predicted_ends = torch.gather(candidate_ends, 1, predicted_indices)
        predicted_scores = torch.gather(candidate_scores, 1, predicted_indices)
        return predicted_starts, predicted_ends, predicted_scores, topk, predicted_indices

    def get_dense_span_labels(self, span_starts, span_ends, span_labels, max_sentence_length,
                              span_parents=None):
        num_sentences = span_starts.size()[0]
        max_spans_num = span_starts.size()[1]

        # span_starts = span_starts + 1 - (span_labels > 0).to(torch.long)
        span_starts[(span_labels == 0) & (span_starts < max_sentence_length - 1)] += 1  # make start > end
        sentence_indices = torch.arange(0, num_sentences, device=span_starts.device).unsqueeze(1).expand(-1,
                                                                                                         max_spans_num)

        sparse_indices = torch.cat([sentence_indices.unsqueeze(2), span_starts.unsqueeze(2), span_ends.unsqueeze(2)],
                                   dim=2)
        if span_parents is not None:  # semantic span predicate offset
            sparse_indices = torch.cat([sparse_indices, span_parents.unsqueeze(2)], 2)

        rank = 3 if span_parents is None else 4
        dense_labels = torch.sparse.LongTensor(sparse_indices.view(num_sentences * max_spans_num, rank).t(),
                                               span_labels.view(-1),
                                               torch.Size([num_sentences] + [max_sentence_length] * (rank - 1))) \
            .to_dense()
        return dense_labels

    @staticmethod
    def gather_4d(params, indices):
        assert len(params.size()) == 4 and len(indices) == 4
        indices_a, indices_b, indices_c, indices_d = indices
        result = params[indices_a, indices_b, indices_c, indices_d]
        return result

    def get_srl_labels(self,
                       arg_starts,
                       arg_ends,
                       predicates,
                       gold_predicates,
                       gold_arg_starts,
                       gold_arg_ends,
                       gold_arg_labels,
                       max_sentence_length
                       ):
        num_sentences = arg_starts.size()[0]
        max_arg_num = arg_starts.size()[1]
        max_pred_num = predicates.size()[1]

        sentence_indices_2d = torch.arange(0, num_sentences, device=arg_starts.device).unsqueeze(1).unsqueeze(2).expand(
            -1, max_arg_num, max_pred_num)
        expanded_arg_starts = arg_starts.unsqueeze(2).expand(-1, -1, max_pred_num)
        expanded_arg_ends = arg_ends.unsqueeze(2).expand(-1, -1, max_pred_num)
        expanded_predicates = predicates.unsqueeze(1).expand(-1, max_arg_num, -1)

        dense_srl_labels = self.get_dense_span_labels(gold_arg_starts,
                                                      gold_arg_ends,
                                                      gold_arg_labels,
                                                      max_sentence_length, span_parents=gold_predicates)  # ans
        srl_labels = self.gather_4d(dense_srl_labels,
                                    [sentence_indices_2d, expanded_arg_starts, expanded_arg_ends, expanded_predicates])
        return srl_labels

    def get_srl_unary_scores(self, span_emb):
        """Score embeddings with the SRL feed-forward stack.

        Each layer in ``self.srl_unary_score_layers`` is followed by ReLU and by the dropout layer at
        the same position in ``self.srl_dropout_layers``; the result is passed through
        ``self.srl_unary_score_projection``.

        Args:
          span_emb: Input embeddings accepted by the first feed-forward layer.

        Returns:
          The output of the projection layer.
        """
        hidden = span_emb
        for i, ffnn in enumerate(self.srl_unary_score_layers):
            # Call the modules themselves rather than ``.forward`` so registered hooks are honoured.
            hidden = F.relu(ffnn(hidden))
            hidden = self.srl_dropout_layers[i](hidden)
        return self.srl_unary_score_projection(hidden)

    def get_srl_scores(self, arg_emb, pred_emb, arg_scores, pred_scores, num_labels, config, dropout):
        """Score every (argument, predicate) pair for every label.

        Args:
          arg_emb: Argument embeddings, ``[batch_size, max_arg_num, arg_emb_size]``.
          pred_emb: Predicate embeddings, ``[batch_size, max_pred_num, pred_emb_size]``.
          arg_scores: Unary argument scores, ``[batch_size, max_arg_num]``.
          pred_scores: Unary predicate scores, ``[batch_size, max_pred_num]``.
          num_labels: Not used by this method.
          config: Not used by this method (``self.config`` is read instead).
          dropout: Not used by this method.

        Returns:
          Tensor ``[batch_size, max_arg_num, max_pred_num, 1 + k]`` where ``k`` is the output size of
          ``get_srl_unary_scores``; index 0 of the last axis is a constant zero score.
        """
        batch_size, arg_count = arg_emb.size()[0], arg_emb.size()[1]
        pred_count = pred_emb.size()[1]

        # Concatenate each argument embedding with each predicate embedding.
        pairs = torch.cat([
            arg_emb.unsqueeze(2).expand(-1, -1, pred_count, -1),
            pred_emb.unsqueeze(1).expand(-1, arg_count, -1, -1),
        ], 3)
        flat_pairs = pairs.view(batch_size * arg_count * pred_count, pairs.size()[3])

        # Unary (feed-forward) pair scores, folded back to one row per pair.
        flat_scores = self.get_srl_unary_scores(flat_pairs)
        scores = flat_scores.view(batch_size, arg_count, pred_count, flat_scores.size(-1))
        if self.config.use_biaffine:
            scores += self.biaffine(arg_emb, self.predicate_scale(pred_emb)).permute([0, 2, 3, 1])

        # Add the unary argument and predicate scores, broadcast over the other axes.
        scores = scores + arg_scores.unsqueeze(2).unsqueeze(3) + pred_scores.unsqueeze(1).unsqueeze(3)
        zero_column = torch.zeros([batch_size, arg_count, pred_count, 1], device=arg_emb.device)
        return torch.cat([zero_column, scores], 3)

    def get_srl_softmax_loss(self, srl_scores, srl_labels, num_predicted_args, num_predicted_preds):
        """Cross-entropy over the (argument, predicate) pairs selected by the loss mask.

        Returns:
          A ``(loss, mask)`` tuple: the cross-entropy reduced with ``self.loss_reduction`` and the
          boolean mask produced by ``get_srl_loss_mask``.
        """
        mask = self.get_srl_loss_mask(srl_scores, num_predicted_args, num_predicted_preds)
        kept_scores = srl_scores[mask]
        kept_labels = srl_labels[mask]
        loss = torch.nn.functional.cross_entropy(kept_scores, kept_labels, reduction=self.loss_reduction)
        return loss, mask

    def get_srl_loss_mask(self, srl_scores, num_predicted_args, num_predicted_preds):
        """Build the mask of (argument, predicate) pairs that are not padding.

        Args:
          srl_scores: Pair scores; axis 1 is the argument axis and axis 2 the predicate axis.
          num_predicted_args: Per-sentence number of kept arguments.
          num_predicted_preds: Per-sentence number of kept predicates.

        Returns:
          The argument mask and the predicate mask combined with ``&`` after broadcasting them
          against each other (argument mask gets a trailing axis, predicate mask an axis at 1).
        """
        lengths_to_mask = hanlp.utils.torch_util.lengths_to_mask
        arg_limit, pred_limit = srl_scores.size()[1], srl_scores.size()[2]
        valid_args = lengths_to_mask(num_predicted_args, arg_limit)
        valid_preds = lengths_to_mask(num_predicted_preds, pred_limit)
        return valid_args.unsqueeze(2) & valid_preds.unsqueeze(1)

    def decode(self, contextualized_embeddings, sent_lengths, masks, gold_arg_starts, gold_arg_ends, gold_arg_labels,
               gold_predicates):
        """Run span-ranking SRL decoding on one batch and optionally compute the loss.

        The method enumerates candidate argument spans, keeps the best ones with ``get_batch_topk``,
        selects predicates (gold ones when ``self.use_gold_predicates`` is True, otherwise the top
        scoring tokens), scores every kept (argument, predicate) pair with ``get_srl_scores`` and,
        when ``gold_arg_labels`` is not None, derives gold pair labels and the softmax loss.

        Args:
          contextualized_embeddings: Per-token encoder output; it is also used directly as the
            predicate embedding. Presumably ``[num_sentences, max_sent_length, dim]`` -- TODO confirm.
          sent_lengths: Per-sentence lengths; new index tensors are created on its device.
          masks: 2-D token mask of size ``[num_sentences, max_sent_length]``.
          gold_arg_starts: Gold argument start offsets (forwarded to ``get_srl_labels``).
          gold_arg_ends: Gold argument end offsets (forwarded to ``get_srl_labels``).
          gold_arg_labels: Gold argument label ids, or None to skip the loss.
          gold_predicates: Gold predicate offsets.

        Returns:
          A dict with the keys ``candidate_starts``, ``candidate_ends``, ``head_scores``,
          ``srl_mask``, ``candidate_arg_scores``, ``candidate_pred_scores``, ``predicates``,
          ``arg_starts``, ``arg_ends``, ``arg_scores``, ``pred_scores``, ``num_args``, ``num_preds``
          and ``srl_scores``; ``loss`` is added only when ``gold_arg_labels`` is not None.

        NOTE(review): when ``self.use_gold_predicates`` is True, ``gold_predicates`` and
        ``gold_arg_labels`` are read before the ``gold_arg_labels is not None`` check, so they have
        to be provided in that mode.
        """
        num_sentences, max_sent_length = masks.size()
        device = sent_lengths.device
        """generate candidate spans with argument pruning"""
        # candidate_starts [num_sentences, max_sent_length * max_arg_width]
        candidate_starts, candidate_ends, candidate_mask = self.get_candidate_spans(
            sent_lengths, max_sent_length, self.config.max_arg_width)
        flatted_candidate_mask = candidate_mask.view(-1)
        batch_word_offset = self.exclusive_cumsum(sent_lengths)  # get the word offset in a batch
        # choose the flatted_candidate_starts with the actual existing positions, i.e. exclude the illegal starts
        flatted_candidate_starts = candidate_starts + batch_word_offset
        flatted_candidate_starts = flatted_candidate_starts.view(-1)[flatted_candidate_mask].to(torch.long)
        flatted_candidate_ends = candidate_ends + batch_word_offset
        flatted_candidate_ends = flatted_candidate_ends.view(-1)[flatted_candidate_mask].to(torch.long)
        # flatten the lstm output according to the sentence mask, i.e. exclude the illegal (padding) lstm output
        flatted_context_output = self.flatten_emb_in_sentence(contextualized_embeddings, masks)
        """generate the span embedding"""
        candidate_span_emb, head_scores, span_head_emb, head_indices, head_indices_log_mask = self.get_span_emb(
            flatted_context_output, flatted_candidate_starts, flatted_candidate_ends,
            self.config, dropout=self.dropout)
        """Get the span ids"""
        # candidate_span_ids maps each valid (sentence, candidate slot) to its row in candidate_span_emb;
        # slots where candidate_mask is false are left at 0 by to_dense().
        candidate_span_number = candidate_span_emb.size()[0]
        max_candidate_spans_num_per_sentence = candidate_mask.size()[1]
        sparse_indices = candidate_mask.nonzero(as_tuple=False)
        sparse_values = torch.arange(0, candidate_span_number, device=device)
        candidate_span_ids = torch.sparse.FloatTensor(sparse_indices.t(), sparse_values,
                                                      torch.Size([num_sentences,
                                                                  max_candidate_spans_num_per_sentence])).to_dense()
        # log of the 0/1 mask: 0 for valid candidates, -inf for invalid ones
        spans_log_mask = torch.log(candidate_mask.to(torch.float))
        predict_dict = {"candidate_starts": candidate_starts, "candidate_ends": candidate_ends,
                        "head_scores": head_scores}
        """Get unary scores and topk of candidate argument spans."""
        flatted_candidate_arg_scores = self.get_arg_unary_scores(candidate_span_emb)
        candidate_arg_scores = flatted_candidate_arg_scores.index_select(0, candidate_span_ids.view(-1)) \
            .view(candidate_span_ids.size()[0], candidate_span_ids.size()[1])
        # adding the log mask pushes the scores of invalid candidates to -inf
        candidate_arg_scores = candidate_arg_scores + spans_log_mask
        arg_starts, arg_ends, arg_scores, num_args, top_arg_indices = \
            self.get_batch_topk(candidate_starts, candidate_ends, candidate_arg_scores,
                                self.config.argument_ratio, sent_lengths, max_sent_length,
                                sort_spans=False, enforce_non_crossing=False)
        """Get the candidate predicate"""
        # every token position is a candidate predicate
        candidate_pred_ids = torch.arange(0, max_sent_length, device=device).unsqueeze(0).expand(num_sentences, -1)
        candidate_pred_emb = contextualized_embeddings
        candidate_pred_scores = self.get_pred_unary_scores(candidate_pred_emb)
        # masked-out token positions receive -inf via the log of the token mask
        candidate_pred_scores = candidate_pred_scores + torch.log(masks.to(torch.float).unsqueeze(2))
        candidate_pred_scores = candidate_pred_scores.squeeze(2)
        if self.use_gold_predicates is True:
            # gold predicates are used as-is with zero scores; num_preds counts the gold entries whose
            # label id is positive
            predicates = gold_predicates
            num_preds = (gold_arg_labels > 0).sum(dim=-1)
            pred_scores = torch.zeros_like(predicates)
            top_pred_indices = predicates
        else:
            # start and end are both the token index, so each "span" is a single token
            predicates, _, pred_scores, num_preds, top_pred_indices = self.get_batch_topk(
                candidate_pred_ids, candidate_pred_ids, candidate_pred_scores, self.config.predicate_ratio,
                sent_lengths, max_sent_length,
                sort_spans=False, enforce_non_crossing=False)
        """Get top arg embeddings"""
        arg_span_indices = torch.gather(candidate_span_ids, 1, top_arg_indices)  # [num_sentences, max_num_args]
        arg_emb = candidate_span_emb.index_select(0, arg_span_indices.view(-1)).view(
            arg_span_indices.size()[0], arg_span_indices.size()[1], -1
        )  # [num_sentences, max_num_args, emb]
        """Get top predicate embeddings"""
        pred_emb = self.batch_index_select(candidate_pred_emb,
                                           top_pred_indices)  # [num_sentences, max_num_preds, emb]
        """Get the srl scores according to the arg emb and pre emb."""
        srl_scores = self.get_srl_scores(arg_emb, pred_emb, arg_scores, pred_scores, self.label_space_size, self.config,
                                         self.dropout)  # [num_sentences, max_num_args, max_num_preds, num_labels]
        if gold_arg_labels is not None:
            """Get the answers according to the labels"""
            srl_labels = self.get_srl_labels(arg_starts, arg_ends, predicates, gold_predicates, gold_arg_starts,
                                             gold_arg_ends, gold_arg_labels, max_sent_length)

            """Compute the srl loss"""
            srl_loss, srl_mask = self.get_srl_softmax_loss(srl_scores, srl_labels, num_args, num_preds)
            predict_dict.update({
                'srl_mask': srl_mask,
                'loss': srl_loss
            })
        else:
            # no gold labels: only the validity mask of the kept pairs is reported
            predict_dict['srl_mask'] = self.get_srl_loss_mask(srl_scores, num_args, num_preds)
        predict_dict.update({
            "candidate_arg_scores": candidate_arg_scores,
            "candidate_pred_scores": candidate_pred_scores,
            "predicates": predicates,
            "arg_starts": arg_starts,
            "arg_ends": arg_ends,
            "arg_scores": arg_scores,
            "pred_scores": pred_scores,
            "num_args": num_args,
            "num_preds": num_preds,
            # [num_sentences, num_args, num_preds] avoid max on empty tensor
            # "arg_labels": torch.max(srl_scores, 1)[1] if srl_scores.numel() else srl_scores[:, :, :, 0],
            "srl_scores": srl_scores,
        })
        return predict_dict


class SpanRankingSRLModel(nn.Module):
    """Embedding layer plus optional context layer feeding a ``SpanRankingSRLDecoder``."""

    def __init__(self, config, embed: torch.nn.Module, context_layer: torch.nn.Module, label_space_size):
        """Wire up the embedding, the optional context layer and the decoder.

        Args:
          config: Configuration object; ``dropout`` and ``lexical_dropout`` are read here and the
            object is passed on to the decoder.
          embed: Embedding module exposing ``get_output_dim()``.
          context_layer: Optional encoder exposing ``get_output_dim()``; when falsy, the decoder
            consumes the embedding output directly.
          label_space_size: Label space size forwarded to the decoder.
        """
        super().__init__()
        self.config = config
        self.label_space_size = label_space_size
        self.dropout = float(config.dropout)
        self.lexical_dropout = float(config.lexical_dropout)

        # Sub-modules are assigned in the order embed -> context_layer -> decoder.
        self.embed = embed
        self.word_embedding_dim = embed.get_output_dim()
        self.context_layer = context_layer
        if context_layer:
            decoder_input_dim = context_layer.get_output_dim()
        else:
            decoder_input_dim = self.word_embedding_dim
        self.decoder = SpanRankingSRLDecoder(decoder_input_dim, label_space_size, config)

    def forward(self,
                batch: Dict[str, torch.Tensor]
                ):
        """Embed the batch, optionally contextualize it, and decode.

        Returns:
          Whatever ``self.decoder.decode`` returns for this batch.
        """
        arg_ends, arg_labels, arg_starts, predicates, masks, lengths = \
            self.unpack(batch, training=self.training)

        hidden = F.dropout(self.embed(batch), self.lexical_dropout, self.training)
        if self.context_layer:
            hidden = self.context_layer(hidden, masks)

        return self.decoder.decode(hidden, lengths, masks, arg_starts, arg_ends, arg_labels, predicates)

    @staticmethod
    def unpack(batch, mask=None, training=False):
        """Pull the SRL fields out of a batch dict.

        Args:
          batch: Mapping that may contain ``token_length``, ``predicate_offset``,
            ``argument_begin_offset``, ``argument_end_offset`` and ``srl_label_id``; missing keys
            yield None.
          mask: Optional token mask; when None it is built from ``token_length``.
          training: Accepted for interface compatibility; not used.

        Returns:
          ``(gold_arg_ends, gold_arg_labels, gold_arg_starts, gold_predicates, mask, sent_lengths)``.
        """
        sent_lengths = batch.get('token_length', None)
        gold_predicates = batch.get('predicate_offset', None)
        gold_arg_starts = batch.get('argument_begin_offset', None)
        gold_arg_ends = batch.get('argument_end_offset', None)
        gold_arg_labels = batch.get('srl_label_id', None)
        if mask is None:
            mask = hanlp.utils.torch_util.lengths_to_mask(sent_lengths)
        return gold_arg_ends, gold_arg_labels, gold_arg_starts, gold_predicates, mask, sent_lengths


================================================
FILE: hanlp/components/srl/span_rank/srl_eval_utils.py
================================================
# Evaluation util functions for PropBank SRL.

import codecs
import collections
import operator
import tempfile
from collections import Counter

from hanlp.metrics.srl.srlconll import official_conll_05_evaluate

# Relative path of an evaluation shell script (presumably the CoNLL scorer wrapper -- confirm).
# NOTE(review): nothing else in this module references this constant.
_SRL_CONLL_EVAL_SCRIPT = "../run_eval.sh"


def split_example_for_eval(example):
    """Split document-based samples into sentence-based samples for evaluation.

    Args:
      example: Mapping with ``"sentences"`` (an iterable of token sequences) and ``"srl"`` (indexable
        by sentence position; each item is an iterable of ``(predicate, arg_start, arg_end, label)``
        records whose offsets are counted from the start of the document).

    Returns:
      A list with one ``(sentence, srl_rels, ner_spans)`` tuple per sentence. ``srl_rels`` maps a
      sentence-relative predicate offset to a list of sentence-relative
      ``(arg_start, arg_end, label)`` tuples; ``ner_spans`` is always an empty list.

    """
    samples = []
    word_offset = 0  # number of words in the sentences already processed
    for i, sentence in enumerate(example["sentences"]):
        srl_rels = {}
        for r in example["srl"][i]:
            pred_id = r[0] - word_offset
            srl_rels.setdefault(pred_id, []).append((r[1] - word_offset, r[2] - word_offset, r[3]))
        samples.append((sentence, srl_rels, []))  # third slot (NER spans) is unused
        word_offset += len(sentence)
    return samples


def evaluate_retrieval(span_starts, span_ends, span_scores, pred_starts, pred_ends, gold_spans,
                       text_length, evaluators, debugging=False):
    """Evaluation for unlabeled retrieval.

    Every evaluator is updated with the gold set and a predicted set chosen by its key ``k``:

    * ``k == -3``: the candidate spans that are also gold spans.
    * ``k == -2``: the spans given by ``pred_starts`` / ``pred_ends``.
    * ``k == 0``: the candidate spans whose score is greater than 0.
    * ``k == -1``: the ``len(gold_spans)`` best scoring candidate spans.
    * any other ``k``: the ``k * text_length // 100`` best scoring candidate spans.

    Args:
      gold_spans: Set of tuples of (start, end).
      span_starts: Start offsets of the candidate spans.
      span_ends: End offsets of the candidate spans.
      span_scores: Scores of the candidate spans, aligned with ``span_starts``.
      pred_starts: Start offsets of the predicted spans.
      pred_ends: End offsets of the predicted spans.
      text_length: Length of the text, used to turn a percentage ``k`` into a span budget.
      evaluators: Mapping from ``k`` to an object with ``update(gold_set=..., predicted_set=...)``.
      debugging: When True, print the top ranked candidates and the gold spans for ``k == -2``.
        (Default value = False)

    Returns:
      None. The evaluators are updated in place.

    """
    # Candidates ordered by descending score; an empty input simply yields an empty ranking.
    ranked = sorted(zip(span_starts, span_ends, span_scores), key=operator.itemgetter(2), reverse=True)
    ranked_spans = [(start, end) for start, end, _ in ranked]

    for k, evaluator in list(evaluators.items()):
        if k == -3:
            predicted_spans = set(zip(span_starts, span_ends)) & gold_spans
        elif k == -2:
            if debugging:
                print("Predicted", ranked[:len(gold_spans)])
                print("Gold", gold_spans)
            predicted_spans = set(zip(pred_starts, pred_ends))
        elif k == 0:
            # Element-wise filtering works for plain sequences as well as for arrays.
            predicted_spans = {(start, end)
                               for start, end, score in zip(span_starts, span_ends, span_scores)
                               if score > 0}
        else:
            if k == -1:
                num_predictions = len(gold_spans)
            else:
                # Integer budget: a float here would make the slice below raise TypeError.
                num_predictions = int((k * text_length) // 100)
            predicted_spans = set(ranked_spans[:num_predictions])
        evaluator.update(gold_set=gold_spans, predicted_set=predicted_spans)


def _calc_f1(total_gold, total_predicted, total_matched, message=None):
    precision = total_matched / total_predicted if total_predicted > 0 else 0
    recall = total_matched / total_gold if total_gold > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
    if message:
        print(("{}: Precision: {:.2%} Recall: {:.2%} F1: {:.2%}".format(message, precision, recall, f1)))
    return precision, recall, f1


def compute_span_f1(gold_data, predictions, task_name):
    """Compute labeled and unlabeled span scores over aligned gold/predicted span lists.

    A gold span and a predicted span match unlabeled when their first two fields (start, end) are
    equal, and labeled when the third field (label) is equal as well.

    Args:
      gold_data: Sequence of per-sample gold span collections.
      predictions: Sequence of per-sample predicted span collections, same length as ``gold_data``.
      task_name: Name used in the printed summaries.

    Returns:
      ``(prec, recall, f1, ul_prec, ul_recall, ul_f1, label_confusions)`` where
      ``label_confusions`` counts ``(gold_label, predicted_label)`` pairs of boundary matches.
    """
    assert len(gold_data) == len(predictions)
    gold_count = 0
    predicted_count = 0
    labeled_hits = 0
    unlabeled_hits = 0
    label_confusions = Counter()  # Counter of (gold, pred) label pairs.

    for gold, pred in zip(gold_data, predictions):
        gold_count += len(gold)
        predicted_count += len(pred)
        for gold_span in gold:
            for pred_span in pred:
                if not (gold_span[0] == pred_span[0] and gold_span[1] == pred_span[1]):
                    continue
                unlabeled_hits += 1
                label_confusions[(gold_span[2], pred_span[2])] += 1
                if gold_span[2] == pred_span[2]:
                    labeled_hits += 1

    prec, recall, f1 = _calc_f1(gold_count, predicted_count, labeled_hits, task_name)
    ul_prec, ul_recall, ul_f1 = _calc_f1(gold_count, predicted_count, unlabeled_hits,
                                         "Unlabeled " + task_name)
    return prec, recall, f1, ul_prec, ul_recall, ul_f1, label_confusions


def compute_unlabeled_span_f1(gold_data, predictions, task_name):
    """Compute span scores; an alias of ``compute_span_f1``.

    The previous body was a line-for-line copy of ``compute_span_f1``, so the work is delegated to
    keep the two from drifting apart. Inputs, printed summaries and the returned tuple are
    unchanged.

    Args:
      gold_data: Sequence of per-sample gold span collections.
      predictions: Sequence of per-sample predicted span collections, same length as ``gold_data``.
      task_name: Name used in the printed summaries.

    Returns:
      ``(prec, recall, f1, ul_prec, ul_recall, ul_f1, label_confusions)``, i.e. both the labeled
      and the unlabeled scores.

    Raises:
      AssertionError: If ``gold_data`` and ``predictions`` differ in length.
    """
    return compute_span_f1(gold_data, predictions, task_name)


# Result record returned by ``compute_srl_f1``.
SRLScores = collections.namedtuple(
    'SRLScores',
    (
        'unlabeled_precision', 'unlabeled_recall', 'unlabeled_f1',
        'precision', 'recall', 'f1',
        'conll_precision', 'conll_recall', 'conll_f1',
        'label_confusions',
        'num_sents',
    ),
)


def compute_srl_f1(sentences, gold_srl, predictions, gold_path=None) -> SRLScores:
    """Compute unofficial and official (CoNLL-05 script) SRL scores.

    Args:
      sentences: Token sequences, one per sentence.
      gold_srl: Per sentence, a dict mapping a predicate offset to ``(start, end, label)`` tuples.
      predictions: Per sentence, a dict in the same format as ``gold_srl``.
      gold_path: Optional path of a gold CoNLL file. When falsy, a gold file is generated from
        ``gold_srl`` in a temporary directory.

    Returns:
      An ``SRLScores`` record. ``num_sents`` counts the sentences whose counted gold relations and
      counted predicted relations are all matched.

    Raises:
      AssertionError: If ``gold_srl`` and ``predictions`` differ in length.
    """
    import os  # local import: this module has no file-level ``os`` import

    assert len(gold_srl) == len(predictions)
    total_gold = 0
    total_predicted = 0
    total_matched = 0
    total_unlabeled_matched = 0
    num_sents = 0
    label_confusions = Counter()

    # Compute unofficial F1 of SRL relations.
    for gold, prediction in zip(gold_srl, predictions):
        gold_rels = 0
        pred_rels = 0
        matched = 0
        for pred_id, gold_args in gold.items():
            # Gold side ignores both "V" and "C-V" arguments.
            filtered_gold_args = [a for a in gold_args if a[2] not in ["V", "C-V"]]
            total_gold += len(filtered_gold_args)
            gold_rels += len(filtered_gold_args)
            if pred_id not in prediction:
                continue
            for a0 in filtered_gold_args:
                for a1 in prediction[pred_id]:
                    if a0[0] == a1[0] and a0[1] == a1[1]:
                        total_unlabeled_matched += 1
                        label_confusions.update([(a0[2], a1[2]), ])
                        if a0[2] == a1[2]:
                            total_matched += 1
                            matched += 1
        for args in prediction.values():
            # Predicted side ignores only "V" arguments.
            num_counted = sum(1 for a in args if a[2] not in ["V"])
            total_predicted += num_counted
            pred_rels += num_counted

        if gold_rels == matched and pred_rels == matched:
            num_sents += 1

    precision, recall, f1 = _calc_f1(total_gold, total_predicted, total_matched)
    unlabeled_precision, unlabeled_recall, unlabeled_f1 = _calc_f1(total_gold, total_predicted,
                                                                   total_unlabeled_matched)

    # The intermediate CoNLL files live in a private temporary directory that is removed once the
    # official scorer has returned, so no temp files are left behind and no name can be reused by
    # another process in between.
    with tempfile.TemporaryDirectory() as work_dir:
        if not gold_path:
            gold_path = os.path.join(work_dir, "gold.conll")
            print_to_conll(sentences, gold_srl, gold_path, None)
            gold_predicates = None
        else:
            gold_predicates = read_gold_predicates(gold_path)

        temp_output = os.path.join(work_dir, "pred.conll")
        print_to_conll(sentences, predictions, temp_output, gold_predicates)

        # Evaluate with the official script.
        conll_precision, conll_recall, conll_f1 = official_conll_05_evaluate(temp_output, gold_path)

    return SRLScores(unlabeled_precision, unlabeled_recall, unlabeled_f1, precision, recall, f1, conll_precision,
                     conll_recall, conll_f1, label_confusions, num_sents)


def print_sentence_to_conll(fout, tokens, labels):
    """Print a labeled sentence into CoNLL format.

    One line is written per token: the token left-justified to 15 characters followed by that
    token's entry from every label column, each right-justified to 15 characters. A blank line
    terminates the sentence.

    Args:
      fout: Writable text stream.
      tokens: Sequence of token strings.
      labels: Sequence of label columns; every column must be as long as ``tokens``.

    Returns:
      None.

    """
    token_count = len(tokens)
    for column in labels:
        assert len(column) == token_count
    for row, token in enumerate(tokens):
        cells = [token.ljust(15)]
        cells.extend(column[row].rjust(15) for column in labels)
        fout.write("".join(cells) + "\n")
    fout.write("\n")


def read_gold_predicates(gold_path):
    """Read the first column of a blank-line separated, UTF-8 encoded CoNLL file.

    Args:
      gold_path: Path of the file to read.

    Returns:
      A list of lists: one inner list per block of non-blank lines, holding the first
      whitespace-separated field of each line. Every blank line starts a new inner list, so a
      trailing blank line yields a trailing empty list.
    """
    print("gold path", gold_path)
    gold_predicates = [[], ]
    # The context manager closes the file even if reading or decoding fails part-way.
    with codecs.open(gold_path, "r", "utf-8") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                gold_predicates.append([])
            else:
                gold_predicates[-1].append(line.split()[0])
    return gold_predicates


def print_to_conll(sentences, srl_labels, output_filename, gold_predicates=None):
    """Write SRL structures to a UTF-8 file in CoNLL props format.

    Args:
      sentences: Token sequences, one per sentence.
      srl_labels: Per sentence, a dict mapping a predicate offset to ``(start, end, label)`` tuples
        with inclusive ``end``.
      output_filename: Path of the file to (over)write.
      gold_predicates: Optional per-sentence lists of gold predicate column values. Where the gold
        value at a predicate offset is not ``"-"`` it is written instead of ``"P" + word``.

    Raises:
      AssertionError: If a gold predicate row and its sentence differ in length.
    """
    # The context manager closes the file even when an assertion or indexing error interrupts
    # the writing.
    with codecs.open(output_filename, "w", "utf-8") as fout:
        for sent_id, words in enumerate(sentences):
            if gold_predicates:
                assert len(gold_predicates[sent_id]) == len(words)
            pred_to_args = srl_labels[sent_id]
            props = ["-" for _ in words]
            col_labels = [["*" for _ in words] for _ in range(len(pred_to_args))]
            for i, pred_id in enumerate(sorted(pred_to_args.keys())):
                # To make sure CoNLL-eval script count matching predicates as correct.
                if gold_predicates and gold_predicates[sent_id][pred_id] != "-":
                    props[pred_id] = gold_predicates[sent_id][pred_id]
                else:
                    props[pred_id] = "P" + words[pred_id]
                # flags[j] is True once token j is covered by an argument already written for this
                # predicate; an argument overlapping covered tokens is dropped.
                flags = [False for _ in words]
                for start, end, label in pred_to_args[pred_id]:
                    if not max(flags[start:end + 1]):
                        col_labels[i][start] = "(" + label + col_labels[i][start]
                        col_labels[i][end] = col_labels[i][end] + ")"
                        for j in range(start, end + 1):
                            flags[j] = True
                # Add unpredicted verb (for predicted SRL).
                if not flags[pred_id]:  # the predicate token is not covered by any argument
                    col_labels[i][pred_id] = "(V*)"
            print_sentence_to_conll(fout, props, col_labels)


================================================
FILE: hanlp/components/srl/span_rank/util.py
================================================
# Adopted from https://github.com/KiroSummer/A_Syntax-aware_MTL_Framework_for_Chinese_SRL
import torch


def block_orth_normal_initializer(input_size, output_size):
    """Stack orthogonally initialized blocks into one weight matrix.

    Args:
      input_size: Iterable of block column counts.
      output_size: Iterable of block row counts.

    Returns:
      The blocks concatenated along the first axis, created in the order "for each output size,
      for each input size"; each block of size ``(o, i)`` is filled by
      ``torch.nn.init.orthogonal_``.
    """
    blocks = [
        torch.nn.init.orthogonal_(torch.FloatTensor(rows, cols))
        for rows in output_size
        for cols in input_size
    ]
    return torch.cat(blocks)


================================================
FILE: hanlp/components/sts/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-20 17:02


================================================
FILE: hanlp/components/sts/transformer_sts.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-20 17:03
import logging
from typing import Union, List
import torch
from torch.utils.data import DataLoader
from hanlp.common.structure import History
from hanlp.layers.transformers.pt_imports import AutoConfig_, AutoTokenizer_
from transformers import AutoModelForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput
from hanlp.common.dataset import SortingSamplerBuilder, PadSequenceDataLoader
from hanlp.common.torch_component import TorchComponent
from hanlp.datasets.sts.stsb import SemanticTextualSimilarityDataset
from hanlp.layers.transformers.utils import build_optimizer_scheduler_with_transformer
from hanlp.metrics.spearman_correlation import SpearmanCorrelation
from hanlp.transform.transformer_tokenizer import TransformerTextTokenizer
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs, reorder
from hanlp_common.constant import IDX


class TransformerSemanticTextualSimilarity(TorchComponent):

    def __init__(self, **kwargs) -> None:
        """
        A simple Semantic Textual Similarity (STS) baseline which fine-tunes a transformer with a regression layer on
        top of it.

        Args:
            **kwargs: Predefined config.
        """
        super().__init__(**kwargs)
        # Assigned in ``on_config_ready``; used as the dataset transform and as the source of the pad token id.
        self._tokenizer = None

    # noinspection PyMethodOverriding
    def build_dataloader(self, data, batch_size, sent_a_col=None,
                         sent_b_col=None,
                         similarity_col=None,
                         delimiter='auto',
                         gradient_accumulation=1,
                         sampler_builder=None,
                         shuffle=False, device=None, logger: logging.Logger = None,
                         split=None,
                         **kwargs) -> DataLoader:
        """Build a padded dataloader over sentence pairs, batched by a sampler built from input lengths.

        Args:
            data: Passed to ``SemanticTextualSimilarityDataset``; dataset caching is enabled when it is a ``str``.
            batch_size: Batch size for the default ``SortingSamplerBuilder`` (ignored when ``sampler_builder`` is given).
            sent_a_col: Forwarded to the dataset.
            sent_b_col: Forwarded to the dataset.
            similarity_col: Forwarded to the dataset.
            delimiter: Forwarded to the dataset.
            gradient_accumulation: Forwarded to ``sampler_builder.build``.
            sampler_builder: Optional sampler builder; a ``SortingSamplerBuilder`` is created when falsy.
            shuffle: Forwarded to ``sampler_builder.build``.
            device: Device passed to ``PadSequenceDataLoader``.
            logger: Not used here.
            split: When equal to ``'trn'``, the min and max similarity of the dataset are stored in the config
                as ``min_score`` and ``max_score`` (later used by ``decode`` for clipping).
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader`` padding ``input_ids`` with the tokenizer's pad token id and
            ``similarity`` with ``0.0``.
        """
        dataset = SemanticTextualSimilarityDataset(data,
                                                   sent_a_col,
                                                   sent_b_col,
                                                   similarity_col,
                                                   delimiter=delimiter,
                                                   transform=self._tokenizer,
                                                   cache=isinstance(data, str))
        if split == 'trn':
            scores = [x['similarity'] for x in dataset]
            self.config.max_score = max(scores)
            self.config.min_score = min(scores)
        if not sampler_builder:
            sampler_builder = SortingSamplerBuilder(batch_size=batch_size)
        lens = [len(x['input_ids']) for x in dataset]
        return PadSequenceDataLoader(dataset, batch_sampler=sampler_builder.build(lens, shuffle, gradient_accumulation),
                                     device=device,
                                     pad={'similarity': 0.0, 'input_ids': self._tokenizer.tokenizer.pad_token_id})

    def build_optimizer(self, trn, epochs, gradient_accumulation=1, lr=1e-3, transformer_lr=5e-5, adam_epsilon=1e-8,
                        weight_decay=0.0, warmup_steps=0.1, **kwargs):
        """Build the optimizer and scheduler via ``build_optimizer_scheduler_with_transformer``.

        The number of training steps is ``len(trn) * epochs // gradient_accumulation``.

        Args:
            trn: Training dataloader; only its length is used.
            epochs: Number of training epochs.
            gradient_accumulation: Number of batches accumulated per optimizer step.
            lr: Learning rate forwarded to the builder.
            transformer_lr: Learning rate for ``self.model.base_model``, forwarded to the builder.
            adam_epsilon: Forwarded to the builder.
            weight_decay: Forwarded to the builder.
            warmup_steps: Forwarded to the builder.
            **kwargs: Not used.

        Returns:
            A tuple of ``(optimizer, scheduler)``.
        """
        num_training_steps = len(trn) * epochs // gradient_accumulation
        optimizer, scheduler = build_optimizer_scheduler_with_transformer(self.model,
                                                                          self.model.base_model,
                                                                          lr, transformer_lr,
                                                                          num_training_steps, warmup_steps,
                                                                          weight_decay, adam_epsilon)
        return optimizer, scheduler

    def build_criterion(self, **kwargs):
        """Return ``None``: the loss is read from the model output (see ``feed_batch``, which passes labels)."""
        pass

    def build_metric(self, **kwargs):
        """Create a fresh ``SpearmanCorrelation`` metric."""
        return SpearmanCorrelation()

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, gradient_accumulation=1, **kwargs):
        """Train for ``epochs`` epochs, saving weights whenever the dev metric improves.

        After the last epoch, the best weights are reloaded unless the last epoch was itself the best.

        Args:
            trn: Training dataloader.
            dev: Development dataloader evaluated after every epoch.
            epochs: Number of epochs.
            criterion: Forwarded to ``fit_dataloader``.
            optimizer: The ``(optimizer, scheduler)`` pair, forwarded to ``fit_dataloader``.
            metric: Metric object shared by training and evaluation; it is reset by both.
            save_dir: Directory used by ``save_weights`` / ``load_weights``.
            logger: Logger for progress reports.
            devices: Not used here.
            ratio_width: Forwarded to ``fit_dataloader`` / ``evaluate_dataloader``.
            gradient_accumulation: Forwarded to ``fit_dataloader``.
            **kwargs: Not used.
        """
        best_epoch, best_metric = 0, -1
        timer = CountdownTimer(epochs)
        history = History()
        for epoch in range(1, epochs + 1):
            logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
            self.fit_dataloader(trn, criterion, optimizer, metric, logger, ratio_width=ratio_width,
                                gradient_accumulation=gradient_accumulation, history=history, save_dir=save_dir)
            report = f'{timer.elapsed_human}/{timer.total_time_human}'
            self.evaluate_dataloader(dev, logger, ratio_width=ratio_width, save_dir=save_dir, metric=metric)
            # ``metric`` now holds the dev-set result because ``evaluate_dataloader`` resets and refills it.
            if metric > best_metric:
                self.save_weights(save_dir)
                best_metric = float(metric)
                best_epoch = epoch
                report += ' [red]saved[/red]'
            timer.log(report, ratio_percentage=False, newline=True, ratio=False)
        if best_epoch and best_epoch != epochs:
            logger.info(f'Restored the best model with {best_metric} saved {epochs - best_epoch} epochs ago')
            self.load_weights(save_dir)

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric: SpearmanCorrelation, logger: logging.Logger,
                       history=None, gradient_accumulation=1, **kwargs):
        """Run one training epoch over ``trn``.

        Args:
            trn: Training dataloader.
            criterion: Not used; the loss comes from the model output.
            optimizer: A ``(optimizer, scheduler)`` pair; the scheduler may be falsy.
            metric: Metric that is reset and then updated with every batch.
            logger: Logger passed to the progress timer.
            history: ``History`` object deciding when an optimizer step is due.
            gradient_accumulation: Number of batches accumulated per optimizer step.
            **kwargs: Not used.

        Returns:
            The accumulated loss divided by the timer's total number of steps.
        """
        self.model.train()
        optimizer, scheduler = optimizer
        timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
        total_loss = 0
        metric.reset()
        for batch in trn:
            output = self.feed_batch(batch)
            prediction = self.decode(output)
            metric(prediction, batch['similarity'])
            loss = output['loss']
            if gradient_accumulation and gradient_accumulation > 1:
                # Scale so that the accumulated gradient matches a single large batch.
                loss /= gradient_accumulation
            loss.backward()
            total_loss += loss.item()
            if history.step(gradient_accumulation):
                if self.config.grad_norm:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm)
                optimizer.step()
                if scheduler:
                    scheduler.step()
                optimizer.zero_grad()
                timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
                          logger=logger)
            del loss
        return total_loss / timer.total

    @torch.no_grad()
    def evaluate_dataloader(self, data: DataLoader, logger: logging.Logger, metric=None, output=False, **kwargs):
        """Evaluate the model on ``data`` and optionally dump predictions to a file.

        Args:
            data: Dataloader whose batches carry gold ``similarity`` scores.
            logger: Logger passed to the progress timer.
            metric: Metric to reset and update; a new one from ``build_metric`` is used when ``None``.
            output: When truthy, a path to which tab-separated ``sent_a``, ``sent_b`` and the prediction
                are written, one sample per line, in the original sample order.
            **kwargs: Not used.

        Returns:
            The accumulated loss divided by the number of batches.
        """
        if metric is None:
            # The default used to crash on ``metric.reset()``; fall back to a fresh metric instead.
            metric = self.build_metric()
        self.model.eval()
        timer = CountdownTimer(len(data))
        total_loss = 0
        metric.reset()
        if output:
            predictions = []
            orders = []
            samples = []
        for batch in data:
            output_dict = self.feed_batch(batch)
            prediction = self.decode(output_dict)
            metric(prediction, batch['similarity'])
            if output:
                predictions.extend(prediction.tolist())
                orders.extend(batch[IDX])
                samples.extend(list(zip(batch['sent_a'], batch['sent_b'])))
            loss = output_dict['loss']
            total_loss += loss.item()
            timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
                      logger=logger)
            del loss
        if output:
            # Batches are not in input order, so restore it using the recorded sample indices.
            predictions = reorder(predictions, orders)
            samples = reorder(samples, orders)
            # Explicit UTF-8 so that non-ASCII sentences are written identically on every platform.
            with open(output, 'w', encoding='utf-8') as out:
                for s, p in zip(samples, predictions):
                    out.write('\t'.join(s + (str(p),)))
                    out.write('\n')
        return total_loss / timer.total

    # noinspection PyMethodOverriding
    def build_model(self, transformer, training=True, **kwargs) -> torch.nn.Module:
        """Build a sequence classification model with a single output label.

        Args:
            transformer: Identifier passed to ``from_pretrained``.
            training: When true, pretrained weights are loaded; otherwise the model is only instantiated
                from its config.
            **kwargs: Not used.

        Returns:
            The transformer model.
        """
        config = AutoConfig_.from_pretrained(transformer, num_labels=1)
        if training:
            model = AutoModelForSequenceClassification.from_pretrained(transformer, config=config)
        else:
            model = AutoModelForSequenceClassification.from_config(config)
        return model

    @torch.no_grad()
    def predict(self, data: Union[List[str], List[List[str]]], batch_size: int = None, **kwargs) -> Union[
        float, List[float]]:
        """ Predict the similarity between sentence pairs.

        Args:
            data: Sentence pairs. A single pair given as a flat list of two strings is also accepted.
            batch_size: The number of samples in a batch. Defaults to ``self.config.batch_size``.
            **kwargs: Not used.

        Returns:
            Similarities between sentences: a single score for a flat pair, otherwise a list of scores in
            input order. An empty list is returned for empty input.
        """
        if not data:
            return []
        flat = isinstance(data[0], str)
        if flat:
            data = [data]
        dataloader = self.build_dataloader([{'sent_a': x[0], 'sent_b': x[1]} for x in data],
                                           batch_size=batch_size or self.config.batch_size,
                                           device=self.device)
        orders = []
        predictions = []
        for batch in dataloader:
            output_dict = self.feed_batch(batch)
            prediction = self.decode(output_dict)
            predictions.extend(prediction.tolist())
            orders.extend(batch[IDX])
        predictions = reorder(predictions, orders)
        if flat:
            return predictions[0]
        return predictions

    # noinspection PyMethodOverriding
    def fit(self, trn_data, dev_data, save_dir,
            transformer,
            sent_a_col,
            sent_b_col,
            similarity_col,
            delimiter='auto',
            batch_size=32,
            max_seq_len=128,
            epochs=3,
            lr=1e-3,
            transformer_lr=5e-5,
            adam_epsilon=1e-8,
            weight_decay=0.0,
            warmup_steps=0.1,
            gradient_accumulation=1,
            grad_norm=1.0,
            sampler_builder=None,
            devices=None,
            logger=None,
            seed=None,
            finetune: Union[bool, str] = False, eval_trn=True, _device_placeholder=False, **kwargs):
        """Train the component; every argument is merged with ``kwargs`` and forwarded to the parent ``fit``.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        # NOTE: ``locals()`` must be captured before any other local is defined in this method.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def on_config_ready(self, transformer, max_seq_len, **kwargs):
        """Create the sentence-pair tokenizer once ``transformer`` and ``max_seq_len`` are known.

        Args:
            transformer: Identifier passed to ``AutoTokenizer_.from_pretrained``.
            max_seq_len: Maximum sequence length given to the tokenizer transform.
            **kwargs: Remaining config entries, forwarded to the parent hook.
        """
        super().on_config_ready(**kwargs)
        self._tokenizer = TransformerTextTokenizer(AutoTokenizer_.from_pretrained(transformer),
                                                   text_a_key='sent_a',
                                                   text_b_key='sent_b',
                                                   output_key='',
                                                   max_seq_length=max_seq_len)

    def feed_batch(self, batch) -> SequenceClassifierOutput:
        """Run the model on a batch; ``similarity`` is passed as labels when the batch has it."""
        return self.model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'],
                          token_type_ids=batch['token_type_ids'], labels=batch.get('similarity', None))

    def decode(self, output: SequenceClassifierOutput):
        """Turn logits into detached scores clipped to ``[config.min_score, config.max_score]``."""
        return output.logits.squeeze(-1).detach().clip(self.config.min_score, self.config.max_score)

    def report_metrics(self, loss, metric):
        """Format the loss (4 decimals) followed by the metric for progress logging."""
        return f'loss: {loss:.4f} {metric}'


================================================
FILE: hanlp/components/taggers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-28 15:39

================================================
FILE: hanlp/components/taggers/cnn_tagger_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-31 13:52
from abc import ABC
from typing import Union, Tuple, Any, List, Iterable

import tensorflow as tf

from hanlp.components.taggers.tagger_tf import TaggerComponent
from hanlp.transform.tsv_tf import TSVTaggingTransform
from hanlp.common.vocab_tf import VocabTF
from hanlp.layers.embeddings.util_tf import build_embedding


class WindowTokenTransform(TSVTaggingTransform):
    """Tagging transform that represents every token by a fixed-size window of neighbouring tokens."""

    def fit(self, trn_path: str, **kwargs):
        """Build the word and tag vocabularies from the samples read from ``trn_path``."""
        self.word_vocab = VocabTF()
        self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
        for windows, labels in self.file_to_samples(trn_path):
            for window in windows:
                self.word_vocab.update(window)
            self.tag_vocab.update(labels)

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Describe the dataset: string types, ``[None, 2 * radius + 1]`` / ``[None]`` shapes and pad values."""
        window_size = 2 * self.config.window_radius + 1
        types = (tf.string, tf.string)
        shapes = ([None, window_size], [None])
        values = (self.word_vocab.pad_token, self.tag_vocab.first_token)
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        """Yield ``(ngrams, tags)`` where ``ngrams`` holds one window of tokens per word.

        Positions left of the sentence become ``'bos<index>'`` (index is negative) and positions right of
        it become ``'eos+<distance>'``. Without gold tags, every tag is ``self.padding_values[-1]``.
        """
        radius = self.config.window_radius
        offsets = range(-radius, radius + 1)
        for sample in inputs:
            if gold:
                words, tags = sample
            else:
                words, tags = sample, [self.padding_values[-1]] * len(sample)
            num_words = len(words)

            def feature_at(index):
                if index < 0:
                    return 'bos{}'.format(index)
                if index >= num_words:
                    return 'eos+{}'.format(index - num_words + 1)
                return words[index]

            ngrams = [[feature_at(center + offset) for offset in offsets] for center, _ in enumerate(words)]
            yield ngrams, tags

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        """Recover the words of each sample by looking up the middle id of every window."""
        for windows in X:
            yield [self.word_vocab.idx_to_token[int(window[len(window) // 2])] for window in windows]


class CNNTaggingModel(tf.keras.models.Model):
    """Keras tagging model: embedding -> parallel Conv2D layers -> concat -> dense scorer over tags."""

    def __init__(self, filters, num_tags, embed, dropout, kernels, **kwargs):
        """
        Args:
            filters: Number of filters of every convolution.
            num_tags: Number of output units of the final dense layer.
            embed: Embedding layer applied to the inputs.
            dropout: Dropout rate used after the embedding and after the convolutions.
            kernels: Iterable of kernel sizes; one ``Conv2D`` is created per entry.
            **kwargs: Accepted but not forwarded to the Keras base class.
        """
        super().__init__()
        self.embed = embed
        self.embed_dropout = tf.keras.layers.Dropout(rate=dropout)
        self.conv2d = []
        for k in kernels:
            # 'same' padding keeps the spatial dims so the outputs can be concatenated.
            self.conv2d.append(
                tf.keras.layers.Conv2D(filters=filters, kernel_size=k, data_format='channels_last', padding='same'))
        self.conv2d_dropout = tf.keras.layers.Dropout(rate=dropout)
        self.concat = tf.keras.layers.Concatenate()
        self.dense = tf.keras.layers.Dense(units=num_tags)

    def call(self, inputs, **kwargs):
        """Compute tag scores for ``inputs``.

        Returns:
            The dense layer output; the embedding mask is attached to it when the static batch
            dimension is known.
        """
        # if inputs.shape_h[0] is None:
        #     return tf.zeros_like()
        #     print(inputs)
        embeds = self.embed(inputs)
        embeds = self.embed_dropout(embeds)
        hs = [conv(embeds) for conv in self.conv2d]
        h = self.concat(hs)
        h = self.conv2d_dropout(h)
        shape_h = tf.shape(h)
        # Merge the last two dims into a single feature dim per position.
        h = tf.reshape(h, [shape_h[0], shape_h[1], h.shape[2] * h.shape[3]])
        o = self.dense(h)
        # NOTE(review): presumably skipped while the batch size is statically unknown — confirm.
        if h.shape[0]:
            # Reuse the embedding mask, taking index 0 of its last axis.
            mask = embeds._keras_mask[:, :, 0]
            o._keras_mask = mask
        return o


class CNNTaggerTF(TaggerComponent, ABC):
    """Tagger component built around ``CNNTaggingModel`` and ``WindowTokenTransform``."""

    def __init__(self, transform: WindowTokenTransform = None) -> None:
        """
        Args:
            transform: Transform to use; a new ``WindowTokenTransform`` is created when falsy.
        """
        if not transform:
            transform = WindowTokenTransform()
        super().__init__(transform)
        self.model: CNNTaggingModel = self.model  # refine the type
        self.transform: WindowTokenTransform = self.transform

    def build_model(self, embedding, **kwargs) -> tf.keras.Model:
        """Build the embedding and the CNN tagging model.

        Args:
            embedding: Embedding specification passed to ``build_embedding``.
            **kwargs: Remaining arguments forwarded to ``CNNTaggingModel``.

        Returns:
            The constructed ``CNNTaggingModel``.
        """
        embed = build_embedding(embedding, self.transform.word_vocab, self.transform)
        # Inputs are mapped to ids by the transform unless the embedding consumes strings.
        self.transform.map_x = embed.dtype != tf.string
        model = CNNTaggingModel(num_tags=len(self.transform.tag_vocab),
                                embed=embed,
                                **kwargs)
        # model.build((None, None, 3))
        return model

    # noinspection PyMethodOverriding
    def fit(self, trn_data: Any, dev_data: Any, save_dir: str, embedding=200, window_radius=3,
            kernels=(1, 2, 3, 4, 5), filters=200, dropout=0.3,
            loss: Union[tf.keras.losses.Loss, str] = None,
            optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='accuracy', batch_size=100,
            epochs=100,
            logger=None, verbose=True, **kwargs):
        """Train the tagger; all arguments are collected and forwarded to the parent ``fit``.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        # NOTE: ``locals()`` must be captured before any other local is defined in this method.
        kwargs.update(locals())
        for k in 'self', 'kwargs', '__class__':
            kwargs.pop(k)
        # Propagate the parent's result instead of silently dropping it.
        return super().fit(**kwargs)

    @property
    def input_shape(self) -> List:
        """Input shape of the model: ``[[None, None, 2 * window_radius + 1]]``."""
        return [[None, None, self.config.window_radius * 2 + 1]]


================================================
FILE: hanlp/components/taggers/ngram_conv/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 22:18

================================================
FILE: hanlp/components/taggers/ngram_conv/ngram_conv_tagger.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-25 00:04

from typing import Union, Optional, Tuple, Any, Iterable, List

import tensorflow as tf

from hanlp_common.structure import SerializableDict
from hanlp.components.taggers.tagger_tf import TaggerComponent
from hanlp.transform.tsv_tf import TSVTaggingTransform
from hanlp.transform.txt_tf import bmes_to_words, extract_ngram_features
from hanlp.common.vocab_tf import VocabTF
from hanlp.layers.embeddings.util_tf import build_embedding
from hanlp.layers.weight_normalization import WeightNormalization
from hanlp_common.util import merge_locals_kwargs


class NgramTransform(TSVTaggingTransform):
    """Tagging transform whose inputs are the words plus n-gram features extracted around them."""

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, **kwargs) -> None:
        super().__init__(config, map_x, map_y, **kwargs)
        # Both vocabularies are populated by ``fit``.
        self.ngram_vocab: Optional[VocabTF] = None
        self.tag_vocab: Optional[VocabTF] = None

    def inputs_to_samples(self, inputs, gold=False):
        """Yield ``((words, *ngram_features), tags)`` for every input.

        Without gold tags, the tags are filled with the tag vocabulary's safe pad token; empty tags
        are replaced by the first tag repeated for every word.
        """
        for sample in inputs:
            if gold:
                words, tags = sample
            else:
                words = sample
                tags = [self.tag_vocab.safe_pad_token] * len(sample)
            if not tags:
                tags = [self.tag_vocab.first_token] * len(words)
            ngram_features = extract_ngram_features(words, False, self.config.window_size)
            yield (words, *ngram_features), tags

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        """Map n-gram features to ids; the word feature is mapped only if ``config.map_word_feature``."""
        head = x[0]
        if self.config.map_word_feature:
            head = self.word_vocab.lookup(head)
        ids = [head]
        ids.extend(self.ngram_vocab.lookup(ngram) for ngram in x[1:])
        return tuple(ids)

    def y_to_idx(self, y) -> tf.Tensor:
        """Map tags to ids using the tag vocabulary."""
        return self.tag_vocab.lookup(y)

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Describe the dataset: one string feature for words plus one per n-gram slot, and string tags."""
        window_size = self.config.window_size
        ngram_size = window_size * (window_size + 1) // 2
        num_features = 1 + ngram_size
        word_vocab, ngram_vocab, tag_vocab = self.word_vocab, self.ngram_vocab, self.tag_vocab

        types = (tf.string,) * num_features, tf.string
        shapes = ([None],) * num_features, [None]

        # Fall back to the word pad token when there is no n-gram vocabulary.
        ngram_pad = ngram_vocab.pad_token if ngram_vocab else word_vocab.pad_token
        tag_default = tag_vocab.pad_token or tag_vocab.first_token
        defaults = (word_vocab.pad_token,) + (ngram_pad,) * ngram_size, tag_default
        return types, shapes, defaults

    def fit(self, trn_path: str, **kwargs):
        """Build word, n-gram and tag vocabularies from ``trn_path``.

        Returns:
            The number of samples read.
        """
        word_vocab = VocabTF()
        ngram_vocab = VocabTF()
        tag_vocab = VocabTF(pad_token=None, unk_token=None)
        num_samples = 0
        for num_samples, (X, Y) in enumerate(self.file_to_samples(trn_path, gold=True), 1):
            word_vocab.update(X[0])
            for ngram in X[1:]:
                # Skip falsy n-gram entries.
                ngram_vocab.update(filter(None, ngram))
            tag_vocab.update(Y)
        self.word_vocab = word_vocab
        # No n-gram vocabulary is kept when the window size is falsy.
        self.ngram_vocab = ngram_vocab if self.config.window_size else None
        self.tag_vocab = tag_vocab
        return num_samples

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        """Recover inputs from the first feature only, delegating to the parent implementation."""
        yield from super().X_to_inputs(X[0])

    def input_truth_output_to_str(self, input: List[str], truth: List[str], output: List[str]):
        """Join the words decoded by ``bmes_to_words`` from the input and predicted tags with spaces."""
        return ' '.join(bmes_to_words(input, output))


class NgramConvTaggingModel(tf.keras.models.Model):
    """Keras tagging model stacking gated 1D convolutions over word (and optional n-gram) embeddings."""

    def __init__(self, word_embed: tf.keras.layers.Embedding, ngram_embed: tf.keras.layers.Embedding, filters,
                 kernel_size, dropout_embed, dropout_hidden, weight_norm, num_tags, **kwargs):
        """
        Args:
            word_embed: Embedding layer for the first input feature.
            ngram_embed: Embedding layer shared by all n-gram features, or ``None`` to disable them.
            filters: Iterable of filter counts; one pair of convolutions is created per entry.
            kernel_size: Kernel size of every convolution.
            dropout_embed: Dropout rate applied to the embedded input.
            dropout_hidden: Dropout rate applied after every gated convolution.
            weight_norm: When truthy, every convolution is wrapped in ``WeightNormalization``.
            num_tags: Number of output units of the final bias-free dense layer.
            **kwargs: Forwarded to the Keras base class.
        """
        super().__init__(**kwargs)
        # The attribute is only set when n-grams are enabled; ``call`` checks for it with ``hasattr``.
        if ngram_embed is not None:
            self.ngram_embed = ngram_embed
        self.word_embed = word_embed
        # self.concat = tf.keras.layers.Concatenate(axis=2)
        self.dropout_embed = tf.keras.layers.Dropout(rate=dropout_embed)
        self.filters_w = []
        self.filters_v = []

        def create_conv1d(filter, name):
            # Build a 'same'-padded Conv1D, optionally wrapped with weight normalization.
            conv = tf.keras.layers.Conv1D(filter, kernel_size, padding="same", name=name)
            if weight_norm:
                conv_norm = WeightNormalization(conv, name=name + '_norm', data_init=False)
                return conv_norm
            return conv

        for idx, filter in enumerate(filters):
            self.filters_w.append(create_conv1d(filter, 'Conv1Dw_{}'.format(idx)))
            self.filters_v.append(create_conv1d(filter, 'Conv1Dv_{}'.format(idx)))
        self.dropout_hidden = tf.keras.layers.Dropout(rate=dropout_hidden)
        self.dense = tf.keras.layers.Dense(num_tags, use_bias=False)

    def call(self, inputs, **kwargs):
        """Compute tag logits.

        Args:
            inputs: With n-grams enabled, a sequence whose first element is the word feature and whose
                remaining elements are n-gram features; otherwise a tensor or a sequence whose first
                element is the word feature.

        Returns:
            The logits produced by the final dense layer.
        """
        if hasattr(self, 'ngram_embed'):
            chars, ngrams = inputs[0], inputs[1:]
            embeds = [self.word_embed(chars)]
            # The mask comes from the word embedding only.
            mask = embeds[0]._keras_mask
            for ngram in ngrams:
                embeds.append(self.ngram_embed(ngram))
            if len(embeds) > 1:
                embed_input = tf.concat(embeds, axis=2)
            else:
                embed_input = embeds[0]
        else:
            chars = inputs if isinstance(inputs, tf.Tensor) else inputs[0]
            embed_input = self.word_embed(chars)
            mask = embed_input._keras_mask

        mask_float = tf.dtypes.cast(mask, tf.float32)
        embed_input = self.dropout_embed(embed_input)
        hidden_output = embed_input
        # NOTE(review): ``.layers`` on these list attributes relies on Keras wrapping tracked lists — confirm.
        for fw, fv in zip(self.filters_w.layers, self.filters_v.layers):
            w = fw(hidden_output)
            v = fv(hidden_output)
            # Gate the output of ``fw`` with the sigmoid of the output of ``fv``.
            hidden_output = w * tf.nn.sigmoid(v)
            # Mask paddings.
            hidden_output = hidden_output * tf.expand_dims(mask_float, -1)
            hidden_output = self.dropout_hidden(hidden_output)
        # dirty hack
        hidden_output._keras_mask = mask
        logits = self.dense(hidden_output)
        return logits


class NgramConvTaggerTF(TaggerComponent):
    """Tagger component built around ``NgramConvTaggingModel`` and ``NgramTransform``."""

    def __init__(self, transform: NgramTransform = None) -> None:
        """
        Args:
            transform: Transform to use; a new ``NgramTransform`` is created when falsy.
        """
        if not transform:
            transform = NgramTransform()
        super().__init__(transform)
        self.transform: NgramTransform = transform

    def build_model(self, word_embed, ngram_embed, window_size, weight_norm, filters, kernel_size, dropout_embed,
                    dropout_hidden, **kwargs) -> tf.keras.Model:
        """Build the embeddings and the gated convolution tagging model.

        Args:
            word_embed: Word embedding specification passed to ``build_embedding``.
            ngram_embed: N-gram embedding specification; only built when ``window_size`` is truthy.
            window_size: N-gram window size; a falsy value disables n-gram embeddings.
            weight_norm: Forwarded to the model.
            filters: Forwarded to the model.
            kernel_size: Forwarded to the model.
            dropout_embed: Forwarded to the model.
            dropout_hidden: Forwarded to the model.
            **kwargs: Not used.

        Returns:
            The constructed ``NgramConvTaggingModel``.
        """
        word_vocab, ngram_vocab, tag_vocab = self.transform.word_vocab, self.transform.ngram_vocab, \
                                             self.transform.tag_vocab
        word_embed = build_embedding(word_embed, word_vocab, self.transform)
        # Move a ``map_x`` config entry to ``map_word_feature`` (read by ``NgramTransform.x_to_idx``);
        # default to True when it is absent.
        if 'map_x' in self.config:
            self.config.map_word_feature = self.config.map_x
            del self.config.map_x
        else:
            self.config.map_word_feature = True
        if window_size:
            ngram_embed = build_embedding(ngram_embed, ngram_vocab, self.transform)
        else:
            ngram_embed = None
        model = NgramConvTaggingModel(word_embed, ngram_embed, filters, kernel_size, dropout_embed, dropout_hidden,
                                      weight_norm, len(tag_vocab))

        return model

    def fit(self, trn_data: Any, dev_data: Any, save_dir: str, word_embed: Union[str, int, dict] = 200,
            ngram_embed: Union[str, int,dict] = 50, embedding_trainable=True, window_size=4, kernel_size=3,
            filters=(200, 200, 200, 200, 200), dropout_embed=0.2, dropout_hidden=0.2, weight_norm=True,
            loss: Union[tf.keras.losses.Loss, str] = None,
            optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='accuracy', batch_size=100,
            epochs=100,
            logger=None, verbose=True, **kwargs):
        """Train the tagger; all arguments are merged with ``kwargs`` and forwarded to the parent ``fit``.

        ``run_eagerly`` is forced to ``True``; passing a falsy ``run_eagerly`` fails the assertion.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        assert kwargs.get('run_eagerly', True), 'NgramConvTaggingModel can only run eagerly'
        kwargs['run_eagerly'] = True
        # ``locals()`` is captured here, so no extra local variables may be defined above this line.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))


================================================
FILE: hanlp/components/taggers/pos_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-05 23:05
from hanlp.components.taggers.cnn_tagger_tf import CNNTaggerTF
from hanlp.components.taggers.rnn_tagger_tf import RNNTaggerTF


class CNNPartOfSpeechTaggerTF(CNNTaggerTF):
    """Part-of-speech tagger that adds nothing on top of ``CNNTaggerTF``."""


class RNNPartOfSpeechTaggerTF(RNNTaggerTF):
    """Part-of-speech tagger that adds nothing on top of ``RNNTaggerTF``."""


================================================
FILE: hanlp/components/taggers/rnn/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-19 15:41

================================================
FILE: hanlp/components/taggers/rnn/rnntaggingmodel.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from typing import Union

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_sequence, pad_packed_sequence
from hanlp.layers.crf.crf import CRF


class RNNTaggingModel(nn.Module):
    """Embedding -> optional linear projection -> bidirectional LSTM -> linear scorer, with an optional CRF."""

    def __init__(self,
                 embed: Union[nn.Embedding, int],
                 rnn_input,
                 rnn_hidden,
                 n_out,
                 drop=0.5,
                 crf=True,
                 crf_constraints=None):
        """
        Args:
            embed: An embedding module, or its output dimension when no module is held by this model.
            rnn_input: Size to project embeddings to before the LSTM; a falsy value disables the projection.
            rnn_hidden: Hidden size of each LSTM direction.
            n_out: Number of output scores per token.
            drop: Dropout probability.
            crf: Whether to create a CRF layer.
            crf_constraints: Second argument given to the CRF constructor.
        """
        super().__init__()

        # Embedding layer (or just its dimension).
        if isinstance(embed, nn.Module):
            embed_layer, embed_dim = embed, embed.embedding_dim
        else:
            embed_layer, embed_dim = None, embed
        self.embed = embed_layer

        # Optional projection in front of the LSTM.
        if rnn_input:
            self.embed_to_rnn = nn.Linear(embed_dim, rnn_input)
            lstm_input = rnn_input
        else:
            self.embed_to_rnn = None
            lstm_input = embed_dim

        # Word-level BiLSTM.
        self.word_lstm = nn.LSTM(input_size=lstm_input,
                                 hidden_size=rnn_hidden,
                                 batch_first=True,
                                 bidirectional=True)

        # Scorer over the concatenated forward/backward states.
        self.out = nn.Linear(2 * rnn_hidden, n_out)
        self.crf = None if not crf else CRF(n_out, crf_constraints)

        self.drop = nn.Dropout(drop)

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize the weight of the output layer with Xavier uniform."""
        nn.init.xavier_uniform_(self.out.weight)

    def forward(self,
                x: torch.Tensor,
                batch=None,
                **kwargs):
        """Score every token.

        Args:
            x: Token ids; positions with ids greater than 0 are treated as real tokens.
            batch: Passed to ``self.embed`` when it is not an ``nn.Embedding``.
            **kwargs: Not used.

        Returns:
            A tuple of the output scores and the boolean mask ``x > 0``.
        """
        mask = x.gt(0)
        lengths = mask.sum(dim=1).tolist()

        if isinstance(self.embed, nn.Embedding):
            hidden = self.embed(x[mask])
        else:
            hidden = self.embed(batch, mask=mask)
            if hidden.dim() == 3:
                # Flatten to one row per real token, like the branch above.
                hidden = hidden[mask]
        hidden = self.drop(hidden)
        if self.embed_to_rnn is not None:
            hidden = self.embed_to_rnn(hidden)

        # Split the flat token rows back into sentences; sequences must already be sorted by length.
        packed = pack_sequence(torch.split(hidden, lengths), enforce_sorted=True)
        packed, _ = self.word_lstm(packed)
        padded, _ = pad_packed_sequence(packed, batch_first=True)

        return self.out(self.drop(padded)), mask


================================================
FILE: hanlp/components/taggers/rnn_tagger.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-20 13:12
import logging

import torch
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader

from hanlp.common.dataset import PadSequenceDataLoader, SortingSampler, TransformableDataset
from hanlp_common.configurable import Configurable
from hanlp.common.transform import EmbeddingNamedTransform
from hanlp.common.vocab import Vocab
from hanlp.components.taggers.rnn.rnntaggingmodel import RNNTaggingModel
from hanlp.components.taggers.tagger import Tagger
from hanlp.datasets.ner.loaders.tsv import TSVTaggingDataset
from hanlp.layers.embeddings.embedding import Embedding
from hanlp.layers.embeddings.util import build_word2vec_with_vocab
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs, merge_dict


class RNNTagger(Tagger):

    def __init__(self, **kwargs) -> None:
        """An old-school tagger using non-contextualized embeddings and RNNs as context layer.

        Args:
            **kwargs: Predefined config.
        """
        super().__init__(**kwargs)
        self.model: RNNTaggingModel = None

    # noinspection PyMethodOverriding
    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion,
                              optimizer,
                              metric,
                              save_dir,
                              logger,
                              patience,
                              **kwargs):
        """Train for at most ``epochs`` epochs, keeping the weights with the best dev metric.

        After every epoch the dev set is evaluated, the scheduler (if any) is stepped, and the weights are
        saved whenever the dev metric improves. Training stops early once ``patience`` epochs have passed
        without improvement. If the last epoch was not the best one, the best weights are reloaded.

        Args:
            trn: Training dataloader.
            dev: Development dataloader.
            epochs: Maximum number of epochs.
            criterion: Replaced by the result of ``self.build_criterion()`` before training starts.
            optimizer: The optimizer; also handed to ``build_scheduler``.
            metric: Metric updated while fitting the training set.
            save_dir: Directory used by ``save_weights`` / ``load_weights``.
            logger: Logger receiving progress messages.
            patience: Number of epochs without improvement tolerated; a falsy value means ``epochs``.
            **kwargs: Not used.
        """
        max_e, max_metric = 0, -1

        criterion = self.build_criterion()
        timer = CountdownTimer(epochs)
        ratio_width = len(f'{len(trn)}/{len(trn)}')
        scheduler = self.build_scheduler(**merge_dict(self.config, optimizer=optimizer, overwrite=True))
        if not patience:
            patience = epochs
        for epoch in range(1, epochs + 1):
            logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
            self.fit_dataloader(trn, criterion, optimizer, metric, logger, ratio_width=ratio_width)
            loss, dev_metric = self.evaluate_dataloader(dev, criterion, logger)
            if scheduler:
                if isinstance(scheduler, ReduceLROnPlateau):
                    # Plateau scheduling is driven by the dev score (mode='max', see build_scheduler).
                    scheduler.step(dev_metric.score)
                else:
                    scheduler.step(epoch)
            report_patience = f'Patience: {epoch - max_e}/{patience}'
            # save the model if it is the best so far
            if dev_metric > max_metric:
                self.save_weights(save_dir)
                max_e, max_metric = epoch, dev_metric
                report_patience = '[red]Saved[/red] '
            stop = epoch - max_e >= patience
            if stop:
                timer.stop()
            timer.log(f'{report_patience} lr: {optimizer.param_groups[0]["lr"]:.4f}',
                      ratio_percentage=False, newline=True, ratio=False)
            if stop:
                break
        timer.stop()
        if max_e != epoch:
            # The final weights are not the best ones: restore the checkpoint saved at epoch ``max_e``.
            self.load_weights(save_dir)
        logger.info(f"Max score of dev is {max_metric.score:.2%} at epoch {max_e}")
        logger.info(f"{timer.elapsed_human} elapsed, average time of each epoch is {timer.elapsed_average_human}")

    def build_scheduler(self, optimizer, anneal_factor, anneal_patience, **kwargs):
        """Build a ``ReduceLROnPlateau`` scheduler in ``max`` mode, or ``None`` when annealing is disabled.

        Args:
            optimizer: The optimizer whose learning rate is annealed.
            anneal_factor: Factor passed to ``ReduceLROnPlateau``; a falsy value disables the scheduler.
            anneal_patience: Patience passed to ``ReduceLROnPlateau``; a falsy value disables the scheduler.
            **kwargs: Not used.

        Returns:
            The scheduler, or ``None`` if either ``anneal_factor`` or ``anneal_patience`` is falsy.
        """
        if anneal_factor and anneal_patience:
            return ReduceLROnPlateau(optimizer,
                                     factor=anneal_factor,
                                     patience=anneal_patience,
                                     mode='max')
        return None

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, ratio_width=None,
                       **kwargs):
        """Run one pass over the training dataloader, updating the model after every batch.

        Args:
            trn: Training dataloader.
            criterion: Loss criterion passed to ``compute_loss``.
            optimizer: Optimizer stepped once per batch.
            metric: Metric updated with every batch.
            logger: Logger used by the progress timer.
            ratio_width: Width hint forwarded to the progress timer.
            **kwargs: Not used.
        """
        self.model.train()
        timer = CountdownTimer(len(trn))
        total_loss = 0
        for idx, batch in enumerate(trn):
            optimizer.zero_grad()
            out, mask = self.feed_batch(batch)
            y = batch['tag_id']
            loss = self.compute_loss(criterion, out, y, mask)
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
            optimizer.step()
            total_loss += loss.item()
            prediction = self.decode_output(out, mask, batch)
            self.update_metrics(metric, out, y, mask, batch, prediction)
            # Report the running mean over the batches seen so far, not the current batch loss
            # divided by the batch count.
            timer.log(f'loss: {total_loss / (idx + 1):.4f} {metric}', ratio_percentage=False, logger=logger,
                      ratio_width=ratio_width)
            # Drop references to the per-batch tensors before the next iteration.
            del loss
            del out
            del mask

    def feed_batch(self, batch):
        """Feed a batch to the model.

        Args:
            batch: A batch containing ``'<token_key>_id'``; it is passed to the model both expanded as keyword
                arguments and as the ``batch`` argument.

        Returns:
            The ``(out, mask)`` pair returned by the model.
        """
        x = batch[f'{self.config.token_key}_id']
        out, mask = self.model(x, **batch, batch=batch)
        return out, mask

    # noinspection PyMethodOverriding
    def build_model(self, rnn_input, rnn_hidden, drop, crf, **kwargs) -> torch.nn.Module:
        """Build the ``RNNTaggingModel`` from the configured embedding.

        The embedding handed to the model depends on the type of ``config['embed']``: the ``output_dim`` of an
        ``EmbeddingNamedTransform``, the module built by an ``Embedding``, or otherwise whatever
        ``build_word2vec_with_vocab`` returns for the token vocab.

        Args:
            rnn_input: Forwarded to ``RNNTaggingModel``.
            rnn_hidden: Forwarded to ``RNNTaggingModel``.
            drop: Forwarded to ``RNNTaggingModel``.
            crf: Forwarded to ``RNNTaggingModel``.
            **kwargs: Not used.

        Returns:
            The constructed model.
        """
        vocabs = self.vocabs
        token_embed = self._convert_embed()
        if isinstance(token_embed, EmbeddingNamedTransform):
            token_embed = token_embed.output_dim
        elif isinstance(token_embed, Embedding):
            token_embed = token_embed.module(vocabs=vocabs)
        else:
            token_embed = build_word2vec_with_vocab(token_embed, vocabs[self.config.token_key])
        model = RNNTaggingModel(token_embed, rnn_input, rnn_hidden, len(vocabs['tag']), drop, crf)
        return model

    def _convert_embed(self):
        """Return ``config['embed']``, first instantiating it (and storing it back) if it is a ``dict``."""
        embed = self.config['embed']
        if isinstance(embed, dict):
            self.config['embed'] = embed = Configurable.from_config(embed)
        return embed

    def build_dataloader(self, data, batch_size, shuffle, device, logger=None, **kwargs) -> DataLoader:
        """Build a length-sorted, padded dataloader, building the vocabs first if they are still mutable.

        Args:
            data: A ``TransformableDataset`` or anything accepted by ``build_dataset``.
            batch_size: Batch size handed to the ``SortingSampler``.
            shuffle: Whether the sampler shuffles.
            device: Device handed to the dataloader.
            logger: Logger used when summarising freshly built vocabs.
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader`` over the dataset.
        """
        vocabs = self.vocabs
        token_embed = self._convert_embed()
        dataset = data if isinstance(data, TransformableDataset) else self.build_dataset(data, transform=[vocabs])
        if vocabs.mutable:
            # Before building vocabs, let embeddings submit their vocabs, some embeddings will possibly opt out as their
            # transforms are not relevant to vocabs
            if isinstance(token_embed, Embedding):
                transform = token_embed.transform(vocabs=vocabs)
                if transform:
                    dataset.transform.insert(-1, transform)
            self.build_vocabs(dataset, logger)
        if isinstance(token_embed, Embedding):
            # Vocabs built, now add all transforms to the pipeline. Be careful about redundant ones.
            transform = token_embed.transform(vocabs=vocabs)
            if transform and transform not in dataset.transform:
                dataset.transform.insert(-1, transform)
        sampler = SortingSampler([len(sample[self.config.token_key]) for sample in dataset], batch_size,
                                 shuffle=shuffle)
        return PadSequenceDataLoader(dataset,
                                     device=device,
                                     batch_sampler=sampler,
                                     vocabs=vocabs)

    def build_dataset(self, data, transform):
        """Wrap ``data`` in a ``TSVTaggingDataset`` using the given transform."""
        return TSVTaggingDataset(data, transform)

    def build_vocabs(self, dataset, logger):
        """Create fresh tag and token vocabs, fill them by iterating ``dataset``, then lock and summarise them.

        Args:
            dataset: The dataset to iterate over.
            logger: Logger receiving the vocab summary.
        """
        self.vocabs.tag = Vocab(unk_token=None, pad_token=None)
        self.vocabs[self.config.token_key] = Vocab()
        # Iterating is done purely for its side effect; presumably the dataset's transform pipeline
        # is what populates the still-mutable vocabs.
        for each in dataset:
            pass
        self.vocabs.lock()
        self.vocabs.summary(logger)

    def fit(self, trn_data, dev_data, save_dir,
            batch_size=50,
            epochs=100,
            embed=100,
            rnn_input=None,
            rnn_hidden=256,
            drop=0.5,
            lr=0.001,
            patience=10,
            crf=True,
            optimizer='adam',
            token_key='token',
            tagging_scheme=None,
            anneal_factor: float = 0.5,
            anneal_patience=2,
            devices=None, logger=None, verbose=True, **kwargs):
        """Train the tagger; every argument is merged with ``kwargs`` and forwarded to the parent ``fit``.

        No local variable may be introduced before the call below: ``locals()`` is what gets forwarded.
        """
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def _id_to_tags(self, ids):
        """Map nested tag ids to tag strings using the tag vocab.

        Args:
            ids: An iterable of iterables of tag ids.

        Returns:
            A list of lists of tags with the same nesting as ``ids``.
        """
        vocab = self.vocabs['tag'].idx_to_token
        return [[vocab[i] for i in b] for b in ids]

    def write_output(self, yhat, y, mask, batch, prediction, output):
        """Intentionally a no-op."""
        pass


================================================
FILE: hanlp/components/taggers/rnn_tagger_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-09-14 20:30
from typing import Union, List

import tensorflow as tf

from hanlp.common.transform_tf import Transform
from hanlp.components.taggers.tagger_tf import TaggerComponent
from hanlp.transform.tsv_tf import TSVTaggingTransform
from hanlp.common.vocab_tf import VocabTF
from hanlp.layers.embeddings.util_tf import build_embedding, embeddings_require_string_input, \
    embeddings_require_char_input
from hanlp_common.util import merge_locals_kwargs


class RNNTaggerTF(TaggerComponent):
    """A BiLSTM tagger assembled as a ``tf.keras.Sequential`` model."""

    def __init__(self, transform: Transform = None) -> None:
        """Create the tagger, falling back to a ``TSVTaggingTransform`` when no transform is given."""
        if not transform:
            transform = TSVTaggingTransform()
            self.transform = transform
        super().__init__(transform)

    def fit(self, trn_data: str, dev_data: str = None, save_dir: str = None, embeddings=100, embedding_trainable=False,
            rnn_input_dropout=0.2, rnn_units=100, rnn_output_dropout=0.2, epochs=20, lower=False, logger=None,
            loss: Union[tf.keras.losses.Loss, str] = None,
            optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='accuracy',
            batch_size=32, dev_batch_size=32, lr_decay_per_epoch=None, verbose=True, **kwargs):
        """Train the tagger; all arguments are merged with ``kwargs`` and forwarded to the parent ``fit``."""
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def build_model(self, embeddings, embedding_trainable, rnn_input_dropout, rnn_output_dropout, rnn_units,
                    loss,
                    **kwargs) -> tf.keras.Model:
        """Stack embedding, optional dropout, a bidirectional LSTM, optional dropout and a per-token dense layer.

        Args:
            embeddings: Embedding specification handed to ``build_embedding``.
            embedding_trainable: Not used in this method.
            rnn_input_dropout: Dropout rate before the LSTM; a falsy value omits the layer.
            rnn_output_dropout: Dropout rate after the LSTM; a falsy value omits the layer.
            rnn_units: Number of units of the LSTM.
            loss: Not used in this method.
            **kwargs: Not used.

        Returns:
            The sequential model.
        """
        keras_layers = tf.keras.layers
        model = tf.keras.Sequential()
        embedding_layer = build_embedding(embeddings, self.transform.word_vocab, self.transform)
        model.add(embedding_layer)
        if rnn_input_dropout:
            input_dropout = keras_layers.Dropout(rnn_input_dropout, name='rnn_input_dropout')
            model.add(input_dropout)
        lstm = keras_layers.LSTM(units=rnn_units, return_sequences=True)
        model.add(keras_layers.Bidirectional(lstm, name='bilstm'))
        if rnn_output_dropout:
            output_dropout = keras_layers.Dropout(rnn_output_dropout, name='rnn_output_dropout')
            model.add(output_dropout)
        # One output unit per tag, applied at every time step.
        per_token_dense = keras_layers.Dense(len(self.transform.tag_vocab))
        model.add(keras_layers.TimeDistributed(per_token_dense, name='dense'))
        return model

    def predict(self, sents: Union[List[str], List[List[str]]], batch_size=32, **kwargs) -> Union[
        List[str], List[List[str]]]:
        """Predict tags for one or several sentences by delegating to the parent ``predict``.

        NOTE(review): ``kwargs`` are accepted but not forwarded to the parent.
        """
        return super().predict(sents, batch_size)

    def save_weights(self, save_dir, filename='model.h5'):
        """Save the model weights, leaving out the first (embedding) layer when it is not trainable.

        NOTE(review): in the non-trainable branch ``filename`` is ignored and ``save_dir`` itself is handed to
        ``save_weights`` - confirm this matches what the loading side expects.
        """
        first_layer: tf.keras.layers.Embedding = self.model.get_layer(index=0)
        if first_layer.trainable:
            super().save_weights(save_dir, filename)
            return
        # remove the pre-trained embedding
        remaining_layers = self.model.layers[1:]
        truncated_model = tf.keras.Sequential(layers=remaining_layers)
        truncated_model.build(input_shape=first_layer.output_shape)
        truncated_model.save_weights(save_dir)

    def build_loss(self, loss, **kwargs):
        """Return a summed sparse categorical cross-entropy on logits when no loss is given, else defer to the parent."""
        if loss:
            return super().build_loss(loss, **kwargs)
        return tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.SUM,
                                                             from_logits=True)

    @property
    def tag_vocab(self) -> VocabTF:
        """The tag vocabulary held by the transform."""
        return self.transform.tag_vocab

    def build_transform(self, embeddings, **kwargs):
        """Adjust the transform to what the embeddings need as input, then defer to the parent.

        Embeddings that require string input switch off ``map_x``; if they also require character input a
        fresh ``VocabTF`` is installed as ``char_vocab``.
        """
        wants_strings = embeddings_require_string_input(embeddings)
        if wants_strings:
            self.transform.map_x = False
        if wants_strings and embeddings_require_char_input(embeddings):
            self.transform.char_vocab = VocabTF()
        return super().build_transform(**kwargs)

    @property
    def sample_data(self):
        """A tiny string batch with a Keras mask, or ``None`` when the transform has no char vocab."""
        if not self.transform.char_vocab:
            return None
        # You cannot build your model by calling `build` if your layers do not support float type inputs.
        # Instead, in order to instantiate and build your model, `call` your model on real tensor data (of the
        # correct dtype).
        pad = self.transform.word_vocab.pad_token
        sample = tf.constant([
            ['hello', 'world', pad],
            ['hello', 'this', 'world'],
        ])
        sample._keras_mask = tf.not_equal(sample, self.transform.word_vocab.pad_token)
        return sample


================================================
FILE: hanlp/components/taggers/tagger.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-11 12:19
import logging
import warnings
from abc import ABC, abstractmethod
from typing import List, TextIO, Any, Union, Dict, Tuple, Sequence

import torch
from torch import optim, nn
from torch.utils.data import DataLoader

from hanlp_common.constant import IDX
from hanlp.common.structure import History
from hanlp.components.distillation.distillable_component import DistillableComponent
from hanlp.components.taggers.util import guess_tagging_scheme
from hanlp.layers.crf.crf import CRF
from hanlp.metrics.accuracy import CategoricalAccuracy
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import reorder
from hanlp_trie import DictInterface, TrieDict
from hanlp_trie.dictionary import TupleTrieDict


class Tagger(DistillableComponent, ABC):
    """Shared training, evaluation and prediction logic for taggers.

    Subclasses must implement :meth:`feed_batch`.
    """

    def build_optimizer(self, optimizer, lr, **kwargs):
        """Build the optimizer over the model parameters.

        Args:
            optimizer: ``'adam'`` or ``'sgd'``.
            lr: Learning rate.
            **kwargs: Not used.

        Returns:
            An ``Adam`` or ``SGD`` optimizer; ``None`` for any other value of ``optimizer``.
        """
        if optimizer == 'adam':
            return optim.Adam(params=self.model.parameters(), lr=lr)
        elif optimizer == 'sgd':
            return torch.optim.SGD(self.model.parameters(), lr=lr)

    def build_criterion(self, model=None, reduction='mean', decoder=None, **kwargs):
        """Build the training criterion.

        Args:
            model: The model owning the CRF layer; falls back to ``decoder`` and then ``self.model``.
            reduction: Reduction of the cross-entropy loss used when CRF is disabled.
            decoder: Alternative owner of the CRF layer, used when ``model`` is falsy.
            **kwargs: Not used.

        Returns:
            The ``crf`` attribute of the model when ``crf`` is enabled in the config, else a
            ``CrossEntropyLoss``.

        Raises:
            ValueError: If CRF is enabled and the model is wrapped in ``nn.DataParallel``.
        """
        if self.config.get('crf', False):
            if not model:
                model = decoder or self.model
            if isinstance(model, nn.DataParallel):
                raise ValueError('DataParallel not supported when CRF is used')
            return model.crf
        else:
            return nn.CrossEntropyLoss(reduction=reduction)

    def build_metric(self, **kwargs):
        """Return a fresh ``CategoricalAccuracy``."""
        return CategoricalAccuracy()

    @abstractmethod
    def feed_batch(self, batch):
        """Feed a batch to the model. Callers in this class unpack the result as ``(logits, mask)``."""
        pass

    def compute_loss(self, criterion, out, y, mask):
        """Compute the loss of a batch.

        Args:
            criterion: The CRF layer when CRF is enabled, otherwise a callable loss.
            out: Model outputs.
            y: Gold tag ids.
            mask: Mask selecting the positions that count.

        Returns:
            The negated CRF forward score when CRF is enabled, else the criterion applied to the masked
            outputs and targets.
        """
        if self.config.get('crf', False):
            criterion: CRF = criterion
            loss = -criterion.forward(out, y, mask)
        else:
            loss = criterion(out[mask], y[mask])
        return loss

    def decode_output(self, logits, mask, batch, model=None):
        """Decode model outputs into tag ids.

        Args:
            logits: Model outputs.
            mask: Mask handed to the CRF decoder.
            batch: Not used here.
            model: The model owning the CRF layer; defaults to ``self.model``.

        Returns:
            The result of CRF decoding when CRF is enabled, else the argmax over the last dimension.
        """
        if self.config.get('crf', False):
            if model is None:
                model = self.model
            crf: CRF = model.crf
            return crf.decode(logits, mask)
        else:
            return logits.argmax(-1)

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, patience=5, teacher=None,
                              kd_criterion=None, eval_trn=True,
                              **kwargs):
        """Train for at most ``epochs`` epochs with early stopping on the dev metric.

        Weights are saved whenever the dev metric improves. After training, the best weights are reloaded
        if the last epoch was not the best; if no epoch ever improved, the current weights are saved.

        Args:
            trn: Training dataloader.
            dev: Development dataloader.
            epochs: Maximum number of epochs.
            criterion: Loss criterion.
            optimizer: Optimizer handed to ``fit_dataloader``.
            metric: Metric handed to ``fit_dataloader``.
            save_dir: Directory used by ``save_weights`` / ``load_weights``.
            logger: Logger receiving progress messages.
            devices: Not used here.
            ratio_width: Width hint for progress reporting.
            patience: Number of epochs without improvement after which training stops.
            teacher: Not used here.
            kd_criterion: Not used here.
            eval_trn: Forwarded to ``fit_dataloader``.
            **kwargs: Not used.

        Returns:
            The best dev metric (``-1`` if no epoch improved on it).
        """
        best_epoch, best_metric = 0, -1
        timer = CountdownTimer(epochs)
        history = History()
        for epoch in range(1, epochs + 1):
            logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
            self.fit_dataloader(trn, criterion, optimizer, metric, logger, history=history, ratio_width=ratio_width,
                                eval_trn=eval_trn, **self.config)
            loss, dev_metric = self.evaluate_dataloader(dev, criterion, logger=logger, ratio_width=ratio_width)
            timer.update()
            report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
            if dev_metric > best_metric:
                best_epoch, best_metric = epoch, dev_metric
                self.save_weights(save_dir)
                report += ' [red](saved)[/red]'
            else:
                report += f' ({epoch - best_epoch})'
                if epoch - best_epoch >= patience:
                    report += ' early stop'
            logger.info(report)
            if epoch - best_epoch >= patience:
                break
        if not best_epoch:
            self.save_weights(save_dir)
        elif best_epoch != epoch:
            self.load_weights(save_dir)
        logger.info(f"Max score of dev is {best_metric} at epoch {best_epoch}")
        logger.info(f"Average time of each epoch is {timer.elapsed_average_human}")
        logger.info(f"{timer.elapsed_human} elapsed")
        return best_metric

    def id_to_tags(self, ids: torch.LongTensor, lens: List[int]):
        """Convert tag ids to tag strings, truncating every sequence to its length.

        Args:
            ids: Tag ids, one row per sequence.
            lens: The length of each sequence.

        Returns:
            A list of tag lists, one per sequence.
        """
        vocab = self.vocabs['tag'].idx_to_token
        return [[vocab[i] for i in b[:l]] for b, l in zip(ids, lens)]

    def update_metrics(self, metric, logits, y, mask, batch=None, prediction=None):
        """Update ``metric`` with the logits, gold ids and mask; ``batch`` and ``prediction`` are not used here."""
        metric(logits, y, mask)

    @torch.no_grad()
    def evaluate_dataloader(self, data, criterion, logger=None, ratio_width=None, metric=None, output=None, **kwargs):
        """Evaluate the model on a dataloader.

        Args:
            data: The dataloader to evaluate on.
            criterion: Loss criterion.
            logger: Logger used by the progress timer.
            ratio_width: Width hint for progress reporting.
            metric: Metric to reset and reuse; a new one is built when falsy.
            output: A path (opened for writing as UTF-8) or a text stream receiving the predictions. The
                stream is closed before returning, whether or not evaluation succeeds.
            **kwargs: Not used.

        Returns:
            A tuple of the loss averaged over batches and the metric.
        """
        self.model.eval()
        if isinstance(output, str):
            # Tokens may be arbitrary Unicode text, so do not depend on the platform default encoding.
            output = open(output, 'w', encoding='utf-8')

        try:
            loss = 0
            if not metric:
                metric = self.build_metric()
            else:
                metric.reset()
            timer = CountdownTimer(len(data))
            for idx, batch in enumerate(data):
                logits, mask = self.feed_batch(batch)
                y = batch['tag_id']
                loss += self.compute_loss(criterion, logits, y, mask).item()
                prediction = self.decode_output(logits, mask, batch)
                self.update_metrics(metric, logits, y, mask, batch, prediction)
                if output:
                    self.write_prediction(prediction, batch, output)
                timer.log(f'loss: {loss / (idx + 1):.4f} {metric}', ratio_percentage=False, logger=logger,
                          ratio_width=ratio_width)
            loss /= len(data)
        finally:
            # Release the stream even if a batch raised, so the handle is not leaked.
            if output:
                output.close()
        return float(loss), metric

    def write_prediction(self, prediction, batch, output: TextIO):
        """Write one tab-separated ``token, predicted, gold`` line per token to ``output``.

        Args:
            prediction: Predicted tags, one sequence per sentence.
            batch: The batch providing the tokens (under the configured token key) and the gold ``'tag'``.
            output: The text stream to write to.
        """
        for tokens, ps, gs in zip(batch[self.config.token_key], prediction, batch['tag']):
            output.write('\n'.join('\t'.join([t, p, g]) for t, p, g in zip(tokens, ps, gs)))
            output.write('\n')

    def predict(self, tokens: Any, batch_size: int = None, **kwargs):
        """Predict tags for a single tokenized sentence or a list of them.

        Args:
            tokens: A list of tokens or a list of such lists.
            batch_size: Batch size; the configured one is used when falsy.
            **kwargs: Forwarded to ``predict_data``.

        Returns:
            ``[]`` for empty input, the tags of the sentence for flat input, else one result per sentence.
        """
        if not tokens:
            return []
        flat = self.input_is_flat(tokens)
        if flat:
            tokens = [tokens]
        outputs = self.predict_data(tokens, batch_size, **kwargs)
        if flat:
            return outputs[0]
        return outputs

    def input_is_flat(self, tokens):
        """Return whether ``tokens`` is a list whose first element is a string, i.e. a single sentence."""
        return isinstance(tokens, list) and isinstance(tokens[0], str)

    def predict_data(self, data, batch_size, sampler_builder=None, **kwargs):
        """Run prediction over ``data`` and return the outputs reordered by the ``IDX`` of each sample.

        Args:
            data: The sentences to predict on.
            batch_size: Batch size; ``config['batch_size']`` (default 32) is used when falsy.
            sampler_builder: Forwarded to ``build_dataloader``.
            **kwargs: Forwarded to ``build_samples`` and ``build_dataloader``.

        Returns:
            The predictions after ``reorder`` has been applied with the collected ``IDX`` values.
        """
        samples = self.build_samples(data, **kwargs)
        if not batch_size:
            batch_size = self.config.get('batch_size', 32)
        dataloader = self.build_dataloader(samples, batch_size, False, self.device, sampler_builder=sampler_builder,
                                           **kwargs)
        outputs = []
        orders = []
        vocab = self.vocabs['tag'].idx_to_token
        for batch in dataloader:
            out, mask = self.feed_batch(batch)
            pred = self.decode_output(out, mask, batch)
            outputs.extend(self.prediction_to_human(pred, vocab, batch))
            orders.extend(batch[IDX])
        outputs = reorder(outputs, orders)
        return outputs

    def build_samples(self, data: List[str], **kwargs):
        """Wrap every sentence into a sample dict keyed by the configured token key."""
        return [{self.config.token_key: sent} for sent in data]

    def prediction_to_human(self, pred_ids, vocab: List[str], batch):
        """Yield the tags of every sentence in a batch, applying the custom tag dictionary if one is set.

        Args:
            pred_ids: Predicted tag ids; a tensor is converted to nested lists first.
            vocab: Maps a tag id to its tag string.
            batch: The batch; tokens are read from ``'<token_key>_'`` if present and non-empty, else from
                the token key itself.

        Yields:
            The tags of one sentence, truncated to the sentence length.
        """
        if isinstance(pred_ids, torch.Tensor):
            pred_ids = pred_ids.tolist()
        sents = batch.get(f'{self.config.token_key}_')
        if not sents:
            sents = batch[self.config.token_key]
        dict_tags: DictInterface = self.dict_tags
        for each, sent in zip(pred_ids, sents):
            tags = [vocab[tag_id] for tag_id in each[:len(sent)]]
            if dict_tags:
                # Spans matched by the custom dictionary override the predicted tags.
                for begin, end, label in dict_tags.tokenize(sent):
                    tags[begin:end] = label
            yield tags

    @property
    def tagging_scheme(self):
        """The tagging scheme from the config; guessed from the tag vocab and stored back when unset."""
        tagging_scheme = self.config.tagging_scheme
        if not tagging_scheme:
            self.config.tagging_scheme = tagging_scheme = guess_tagging_scheme(self.vocabs.tag.idx_to_token)
            if tagging_scheme == 'BIO':
                warnings.warn(f'The tag scheme for {self.vocabs.tag.idx_to_token} might be IOB1 or IOB2 '
                              f'but we are using IOB2 by default. Please set tagging_scheme="IOB1" or tagging_scheme="BIO" '
                              f'to get rid of this warning.')
        return tagging_scheme

    @property
    def dict_tags(self) -> DictInterface:
        r""" A custom dictionary to override predicted tags by performing longest-prefix-matching.

        Examples:
            >>> pos.dict_tags = {'HanLP': 'state-of-the-art-tool'} # Force 'HanLP' to be 'state-of-the-art-tool'
            >>> tagger("HanLP为生产环境带来次世代最先进的多语种NLP技术。")
                # HanLP/state-of-the-art-tool 为/P 生产/NN 环境/NN 带来/VV 次世代/NN 最/AD 先进/VA 的/DEC 多语种/NN NLP/NR 技术/NN 。/PU
            >>> pos.dict_tags = {('的', '希望'): ('补语成分', '名词'), '希望': '动词'} # Conditional matching
            >>> tagger("我的希望是希望张晚霞的背影被晚霞映红。")
                # 我/PN 的/补语成分 希望/名词 是/VC 希望/动词 张晚霞/NR 的/DEG 背影/NN 被/LB 晚霞/NN 映红/VV 。/PU
        """
        return self.config.get('dict_tags', None)

    @dict_tags.setter
    def dict_tags(self,
                  dictionary: Union[DictInterface, Union[Dict[Union[str, Sequence[str]], Union[str, Sequence[str]]]]]):
        if dictionary is not None and not isinstance(dictionary, DictInterface):
            assert isinstance(dictionary, dict), f'Expected dictionary to be `dict` but got {type(dictionary)}.'
            # Normalise every entry to a tuple of tokens mapped to a tuple of tags.
            _d = dict()
            for k, v in dictionary.items():
                if isinstance(k, str):
                    k = (k,)
                if isinstance(v, str):
                    # A single tag applies to every token of the key.
                    v = (v,) * len(k)
                _d[k] = v
            dictionary = TupleTrieDict(_d)
        self.config.dict_tags = dictionary


================================================
FILE: hanlp/components/taggers/tagger_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-25 21:49
import logging
from abc import ABC

import tensorflow as tf

from hanlp.common.keras_component import KerasComponent
from hanlp.layers.crf.crf_layer_tf import CRF, CRFLoss, CRFWrapper
from hanlp.metrics.chunking.iobes_tf import IOBES_F1_TF


class TaggerComponent(KerasComponent, ABC):
    """Keras tagger base adding an ``'f1'`` metric and a ``'crf'`` loss on top of the parent's choices."""

    def build_metrics(self, metrics, logger: logging.Logger, **kwargs):
        """Build the evaluation metric.

        Args:
            metrics: ``'f1'`` selects ``IOBES_F1_TF`` and forces ``run_eagerly``; anything else is handled
                by the parent.
            logger: Logger used for the eager-mode notice.
            **kwargs: Forwarded to the parent.

        Returns:
            The metric.
        """
        if metrics != 'f1':
            return super().build_metrics(metrics, logger, **kwargs)
        assert hasattr(self.transform, 'tag_vocab'), 'Name your tag vocab tag_vocab in your transform ' \
                                                     'or override build_metrics'
        eager_requested = self.config.get('run_eagerly', None)
        if not eager_requested:
            logger.debug('ChunkingF1 runs only under eager mode, '
                         'set run_eagerly=True to remove this warning')
        self.config.run_eagerly = True
        return IOBES_F1_TF(self.transform.tag_vocab)

    def build_loss(self, loss, **kwargs):
        """Build the loss, attaching a CRF to the model when ``loss == 'crf'``.

        A sequential model gets the CRF appended as a layer; any other model is wrapped in ``CRFWrapper``.
        All other losses are handled by the parent.

        Args:
            loss: ``'crf'`` or anything the parent accepts.
            **kwargs: Forwarded to the parent.

        Returns:
            The loss.
        """
        assert self.model is not None, 'should create model before build loss'
        if loss != 'crf':
            return super().build_loss(loss, **kwargs)
        num_tags = len(self.transform.tag_vocab)
        if isinstance(self.model, tf.keras.models.Sequential):
            crf = CRF(num_tags)
            self.model.add(crf)
        else:
            self.model = CRFWrapper(self.model, num_tags)
            crf = self.model.crf
        return CRFLoss(crf, self.model.dtype)


================================================
FILE: hanlp/components/taggers/transformers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 13:57

================================================
FILE: hanlp/components/taggers/transformers/metrics_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-30 16:33
import tensorflow as tf


class Accuracy(tf.keras.metrics.SparseCategoricalAccuracy):
    """Sparse categorical accuracy that leaves out positions whose gold label equals ``mask_value``."""

    def __init__(self, name='sparse_categorical_accuracy', dtype=None, mask_value=0):
        """
        Args:
            name: Name of the metric.
            dtype: Data type of the metric.
            mask_value: Gold label value marking positions to leave out.
        """
        super().__init__(name=name, dtype=dtype)
        self.mask_value = mask_value

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Update the metric; any ``sample_weight`` passed in is replaced by the mask derived from ``y_true``."""
        keep = tf.not_equal(y_true, self.mask_value)
        return super().update_state(y_true, y_pred, keep)


================================================
FILE: hanlp/components/taggers/transformers/transformer_tagger.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-15 20:55
import logging
from typing import Union, List

import torch
from torch import nn
from torch.utils.data import DataLoader

from hanlp.common.dataset import PadSequenceDataLoader, SamplerBuilder, TransformableDataset
from hanlp.common.structure import History
from hanlp.common.transform import FieldLength, TransformList
from hanlp.common.vocab import Vocab
from hanlp.components.classifiers.transformer_classifier import TransformerComponent
from hanlp.components.taggers.tagger import Tagger
from hanlp.datasets.ner.loaders.tsv import TSVTaggingDataset
from hanlp.layers.crf.crf import CRF
from hanlp.layers.embeddings.embedding import EmbeddingDim, Embedding
from hanlp.layers.transformers.encoder import TransformerEncoder
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp.utils.time_util import CountdownTimer
from hanlp.utils.torch_util import clip_grad_norm, lengths_to_mask, filter_state_dict_safely
from hanlp_common.util import merge_locals_kwargs


# noinspection PyAbstractClass
class TransformerTaggingModel(nn.Module):
    def __init__(self,
                 encoder: TransformerEncoder,
                 num_labels,
                 crf=False,
                 secondary_encoder=None,
                 extra_embeddings: EmbeddingDim = None) -> None:
        """
        A shallow tagging model using a transformer as encoder, followed by a linear classifier.
        Args:
            encoder: A pretrained transformer.
            num_labels: Size of tagset.
            crf: True to enable CRF.
            secondary_encoder: Optional module applied to the encoder outputs (called with ``mask``).
            extra_embeddings: Extra embeddings which will be concatenated to the encoder outputs.
        """
        super().__init__()
        self.encoder = encoder
        self.secondary_encoder = secondary_encoder
        self.extra_embeddings = extra_embeddings
        # noinspection PyUnresolvedReferences
        hidden_size = encoder.transformer.config.hidden_size
        # The classifier input grows by the width of the extra embeddings, if there are any.
        extra_size = extra_embeddings.get_output_dim() if extra_embeddings else 0
        self.classifier = nn.Linear(hidden_size + extra_size, num_labels)
        if crf:
            self.crf = CRF(num_labels)
        else:
            self.crf = None

    def forward(self, lens: torch.LongTensor, input_ids, token_span, token_type_ids=None, batch=None):
        """Encode the input and classify every token.

        Args:
            lens: Sequence lengths, turned into the mask via ``lengths_to_mask``.
            input_ids: Forwarded to the encoder.
            token_span: Forwarded to the encoder.
            token_type_ids: Forwarded to the encoder.
            batch: The batch handed to the extra embeddings, when those are configured.

        Returns:
            A tuple of the classifier outputs and the mask.
        """
        mask = lengths_to_mask(lens)
        hidden = self.encoder(input_ids, token_span=token_span, token_type_ids=token_type_ids)
        if self.secondary_encoder:
            hidden = self.secondary_encoder(hidden, mask=mask)
        if self.extra_embeddings:
            # noinspection PyCallingNonCallable
            extra = self.extra_embeddings(batch, mask=mask)
            hidden = torch.cat([hidden, extra], dim=-1)
        logits = self.classifier(hidden)
        return logits, mask


class TransformerTagger(TransformerComponent, Tagger):

    def __init__(self, **kwargs) -> None:
        """A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for
        any tagging tasks including PoS tagging and many others.

        Args:
            **kwargs: Forwarded to the parent constructors.
        """
        super().__init__(**kwargs)
        # Both are created lazily: the model by ``build_model`` and the transform by ``tokenizer_transform``.
        self.model: TransformerTaggingModel = None
        self._tokenizer_transform = None

    # noinspection PyMethodOverriding
    def fit_dataloader(self,
                       trn: DataLoader,
                       criterion,
                       optimizer,
                       metric,
                       logger: logging.Logger,
                       history: History,
                       gradient_accumulation=1,
                       grad_norm=None,
                       transformer_grad_norm=None,
                       teacher: Tagger = None,
                       kd_criterion=None,
                       temperature_scheduler=None,
                       ratio_width=None,
                       eval_trn=True,
                       **kwargs):
        """Train the model with one pass over ``trn``.

        Args:
            trn: The training dataloader. Each batch must provide ``tag_id``.
            criterion: Passed to ``compute_loss`` together with the outputs, gold tags and mask.
            optimizer: A pair of ``(optimizer, scheduler)``. When ``teacher`` is given, ``scheduler`` itself is a
                pair of ``(scheduler, lambda_scheduler)``.
            metric: Updated through ``update_metrics`` when ``eval_trn`` is true.
            logger: Logger which receives the progress report.
            history: Provides the number of training steps and decides (``history.step``) when the optimizer
                shall be stepped.
            gradient_accumulation: The loss is divided by this value when it is greater than 1.
            grad_norm: Passed to ``clip_grad_norm`` in ``_step``.
            transformer_grad_norm: Passed to ``clip_grad_norm`` in ``_step`` for the transformer.
            teacher: An optional teacher tagger whose outputs are used for distillation.
            kd_criterion: The distillation criterion used by ``compute_distill_loss``.
            temperature_scheduler: Used by ``compute_distill_loss`` to obtain the temperature.
            ratio_width: Passed to ``timer.log``.
            eval_trn: True to decode the outputs and update ``metric`` on the training set.
            **kwargs: Not used.
        """
        optimizer, scheduler = optimizer
        if teacher:
            scheduler, lambda_scheduler = scheduler
        else:
            lambda_scheduler = None
        self.model.train()
        timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
        total_loss = 0
        for idx, batch in enumerate(trn):
            out, mask = self.feed_batch(batch)
            y = batch['tag_id']
            loss = self.compute_loss(criterion, out, y, mask)
            if gradient_accumulation and gradient_accumulation > 1:
                loss /= gradient_accumulation
            if teacher:
                # The teacher only provides targets, so no gradient is needed for it.
                with torch.no_grad():
                    out_T, _ = teacher.feed_batch(batch)
                # noinspection PyNoneFunctionAssignment
                kd_loss = self.compute_distill_loss(kd_criterion, out, out_T, mask, temperature_scheduler)
                # Interpolate between the supervised loss and the distillation loss.
                # NOTE(review): kd_loss is not divided by gradient_accumulation while loss is - confirm intended.
                _lambda = float(lambda_scheduler)
                loss = _lambda * loss + (1 - _lambda) * kd_loss
            loss.backward()
            total_loss += loss.item()
            if eval_trn:
                prediction = self.decode_output(out, mask, batch)
                self.update_metrics(metric, out, y, mask, batch, prediction)
            if history.step(gradient_accumulation):
                self._step(optimizer, scheduler, grad_norm, transformer_grad_norm, lambda_scheduler)
                # The reported loss is the running average over the batches seen so far.
                report = f'loss: {total_loss / (idx + 1):.4f} {metric if eval_trn else ""}'
                timer.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
            # Drop the references to the tensors of this batch before the next one is fed.
            del loss
            del out
            del mask

    def _step(self, optimizer, scheduler, grad_norm, transformer_grad_norm, lambda_scheduler):
        """Clip the gradients, update the parameters and the schedulers, then reset the gradients.

        Args:
            optimizer: The optimizer to step and to zero.
            scheduler: The scheduler stepped right after the optimizer.
            grad_norm: Passed to ``clip_grad_norm`` for the whole model.
            transformer_grad_norm: Passed to ``clip_grad_norm`` for the transformer of the encoder.
            lambda_scheduler: An optional scheduler which is stepped only when it is truthy.
        """
        model = self.model
        clip_grad_norm(model, grad_norm, model.encoder.transformer, transformer_grad_norm)
        optimizer.step()
        scheduler.step()
        if lambda_scheduler:
            lambda_scheduler.step()
        optimizer.zero_grad()

    def compute_distill_loss(self, kd_criterion, out_S, out_T, mask, temperature_scheduler):
        """Compute the distillation loss between the student and the teacher on the masked positions.

        Args:
            kd_criterion: Called with the student logits, the teacher logits and the temperature.
            out_S: Outputs of the student.
            out_T: Outputs of the teacher.
            mask: Selects the positions of both outputs which take part in the loss.
            temperature_scheduler: Called with both selected logits to get the temperature.

        Returns:
            Whatever ``kd_criterion`` returns.
        """
        student_logits, teacher_logits = out_S[mask], out_T[mask]
        return kd_criterion(student_logits, teacher_logits, temperature_scheduler(student_logits, teacher_logits))

    def build_model(self, training=True, extra_embeddings: Embedding = None, finetune=False, logger=None,
                    **kwargs) -> torch.nn.Module:
        """Build the tagging model and optionally initialize it from the currently loaded model.

        Args:
            training: Passed to ``build_transformer``.
            extra_embeddings: Optional embeddings whose module is built from ``self.vocabs`` and concatenated to
                the encoder outputs.
            finetune: True to copy compatible parameters from ``self.model`` (if any) into the new model.
            logger: Optional logger used to report parameters missing from the loaded model.
            **kwargs: Not used.

        Returns:
            The new model.
        """
        model = TransformerTaggingModel(
            self.build_transformer(training=training),
            len(self.vocabs.tag),
            self.config.crf,
            self.config.get('secondary_encoder', None),
            extra_embeddings=extra_embeddings.module(self.vocabs) if extra_embeddings else None,
        )
        if finetune and self.model:
            model_state = model.state_dict()
            load_state = self.model.state_dict()
            safe_state = filter_state_dict_safely(model_state, load_state)
            missing_params = model_state.keys() - safe_state.keys()
            # ``logger`` defaults to None, so reporting must not be assumed to be possible.
            if missing_params and logger is not None:
                logger.info(f'The following parameters were missing from the checkpoint: '
                            f'{", ".join(sorted(missing_params))}.')
            model.load_state_dict(safe_state, strict=False)
            old_classifier, new_classifier = self.model.classifier, model.classifier
            num_old = old_classifier.bias.size(0)
            num_new = new_classifier.bias.size(0)
            if num_new != num_old:
                # The tagset size has changed: reuse the rows of the tags shared by both classifiers. Only the
                # common prefix can be copied, otherwise a smaller new tagset leads to a shape mismatch.
                n = min(num_old, num_new)
                new_classifier.weight.data[:n, :] = old_classifier.weight.data[:n, :]
                new_classifier.bias.data[:n] = old_classifier.bias.data[:n]
        return model

    # noinspection PyMethodOverriding
    def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger = None,
                         sampler_builder: SamplerBuilder = None, gradient_accumulation=1,
                         extra_embeddings: Embedding = None, transform=None, max_seq_len=None, **kwargs) -> DataLoader:
        """Build a dataloader which tokenizes, indexes and pads the samples of ``data``.

        Args:
            data: Either a ``TransformableDataset`` used as is, or anything accepted by ``build_dataset``.
            batch_size: Passed to the dataloader.
            shuffle: Passed to the sampler and the dataloader.
            device: Passed to the dataloader.
            logger: Optional logger. It is also passed to ``build_vocabs`` and ``dataset.prune``.
            sampler_builder: Optional builder of a batch sampler over the lengths of the input ids.
            gradient_accumulation: Passed to the sampler builder when ``shuffle`` is true, otherwise 1 is passed.
            extra_embeddings: Optional embeddings whose transform is appended to the dataset.
            transform: Optional extra transform appended before all the others.
            max_seq_len: When ``data`` is a ``str``, samples with more tokens than this are pruned.
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader``.
        """
        if isinstance(data, TransformableDataset):
            dataset = data
        else:
            args = {k: self.config.get(k, None) for k in
                    ['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint']}
            dataset = self.build_dataset(data, **args)
        if self.config.token_key is None:
            # Take the first field of the first sample as the token field.
            self.config.token_key = next(iter(dataset[0]))
            # ``logger`` defaults to None, so the guess is only reported when a logger is available.
            if logger is not None:
                logger.info(
                    f'Guess [bold][blue]token_key={self.config.token_key}[/blue][/bold] according to the '
                    f'training dataset: [blue]{dataset}[/blue]')
        if transform:
            dataset.append_transform(transform)
        if extra_embeddings:
            dataset.append_transform(extra_embeddings.transform(self.vocabs))
        dataset.append_transform(self.tokenizer_transform)
        dataset.append_transform(self.last_transform())
        if not isinstance(data, list):
            dataset.purge_cache()
        if self.vocabs.mutable:
            self.build_vocabs(dataset, logger)
        if isinstance(data, str) and max_seq_len:
            token_key = self.config.token_key
            dataset.prune(lambda x: len(x[token_key]) > max_seq_len, logger)
        if sampler_builder is not None:
            sampler = sampler_builder.build([len(x[f'{self.config.token_key}_input_ids']) for x in dataset], shuffle,
                                            gradient_accumulation=gradient_accumulation if shuffle else 1)
        else:
            sampler = None
        return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler)

    def build_dataset(self, data, transform=None, **kwargs):
        """Wrap ``data`` into a ``TSVTaggingDataset``.

        Args:
            data: Passed to the dataset as is.
            transform: Optional transform of the dataset.
            **kwargs: Extra arguments of the dataset.

        Returns:
            The dataset.
        """
        dataset = TSVTaggingDataset(data, transform=transform, **kwargs)
        return dataset

    def last_transform(self):
        """Build the transforms appended last to a dataset: the vocabs followed by the length of the token field.

        Returns:
            A ``TransformList``.
        """
        return TransformList(self.vocabs, FieldLength(self.config.token_key))

    @property
    def tokenizer_transform(self) -> TransformerSequenceTokenizer:
        """The transform which tokenizes the token field with the transformer tokenizer. It is created on first
        access and cached afterwards.
        """
        cached = self._tokenizer_transform
        if not cached:
            cached = TransformerSequenceTokenizer(self.transformer_tokenizer,
                                                  self.config.token_key,
                                                  ret_token_span=True)
            self._tokenizer_transform = cached
        return cached

    def build_vocabs(self, trn, logger, **kwargs):
        """Build the tag vocab by iterating the training set once, then lock and summarize the vocabs.

        Args:
            trn: The training dataset. It is iterated sample by sample.
            logger: Passed to ``self.vocabs.summary``.
            **kwargs: Not used.
        """
        vocabs = self.vocabs
        if 'tag' not in vocabs:
            vocabs.tag = Vocab(pad_token=None, unk_token=None)
        progress = CountdownTimer(len(trn))
        key = self.config.token_key
        longest = 0
        for sample in trn:
            length = len(sample[key])
            if length > longest:
                longest = length
            progress.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {longest})')
        vocabs.tag.set_unk_as_safe_unk()
        vocabs.lock()
        vocabs.summary(logger)

    # noinspection PyMethodOverriding
    def fit(self,
            trn_data,
            dev_data,
            save_dir,
            transformer,
            average_subwords=False,
            word_dropout: float = 0.2,
            hidden_dropout=None,
            layer_dropout=0,
            scalar_mix=None,
            mix_embedding: int = 0,
            grad_norm=5.0,
            transformer_grad_norm=None,
            lr=5e-5,
            transformer_lr=None,
            transformer_layers=None,
            gradient_accumulation=1,
            adam_epsilon=1e-6,
            weight_decay=0,
            warmup_steps=0.1,
            secondary_encoder=None,
            extra_embeddings: Embedding = None,
            crf=False,
            reduction='sum',
            batch_size=32,
            sampler_builder: SamplerBuilder = None,
            epochs=3,
            patience=5,
            token_key=None,
            max_seq_len=None, sent_delimiter=None, char_level=False, hard_constraint=False,
            transform=None,
            logger=None,
            devices: Union[float, int, List[int]] = None,
            **kwargs):
        """Train this tagger.

        All the arguments are collected with ``merge_locals_kwargs(locals(), kwargs)`` and forwarded to the ``fit``
        of the parent class, which is where most of them are consumed. Do not introduce local variables in this
        method as they would be collected too.

        Args:
            trn_data: Training set.
            dev_data: Development set.
            save_dir: Passed to the parent ``fit`` (presumably where the component is saved - TODO confirm).
            transformer: Passed to the parent ``fit`` (presumably identifies the pretrained transformer).
            grad_norm: Gradient norm used when clipping the gradients of the whole model.
            transformer_grad_norm: Gradient norm used when clipping the gradients of the transformer.
            gradient_accumulation: Number of batches to accumulate before the optimizer is stepped.
            secondary_encoder: Optional module applied on top of the transformer outputs.
            extra_embeddings: Optional embeddings concatenated to the encoder outputs.
            crf: True to create a CRF layer in the model.
            batch_size: Size of a batch.
            sampler_builder: Optional builder of the batch sampler.
            token_key: The field of a sample which holds its tokens. Guessed from the first sample if None.
            max_seq_len: Longer samples are pruned when the data is given as a ``str``.
            sent_delimiter: Passed to the dataset.
            char_level: Passed to the dataset.
            hard_constraint: Passed to the dataset.
            transform: Optional extra transform appended to the datasets.
            logger: Logger to use.
            devices: Passed to the parent ``fit``.
            **kwargs: Merged with the arguments above. The remaining named arguments are forwarded unchanged and
                consumed outside this class.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def feed_batch(self, batch: dict):
        """Feed a batch into the model.

        Args:
            batch: A batch holding the output fields of the tokenizer transform, the length field of the tokens
                and optionally their token type ids.

        Returns:
            The outputs of the model together with the mask.
        """
        features = [batch[key] for key in self.tokenizer_transform.output_key]
        # Two features mean (input_ids, token_span); otherwise only the first one is used and there is no span.
        input_ids = features[0]
        token_span = features[1] if len(features) == 2 else None
        token_key = self.config.token_key
        return self.model(batch[f'{token_key}_length'],
                          input_ids,
                          token_span,
                          batch.get(f'{token_key}_token_type_ids'),
                          batch=batch)

    # noinspection PyMethodOverriding
    def distill(self,
                teacher: str,
                trn_data,
                dev_data,
                save_dir,
                transformer: str,
                batch_size=None,
                temperature_scheduler='flsw',
                epochs=None,
                devices=None,
                logger=None,
                seed=None,
                **kwargs):
        """Distill a teacher into this tagger.

        All the arguments are collected with ``merge_locals_kwargs(locals(), kwargs)`` and forwarded to the
        ``distill`` of the parent class. Do not introduce local variables in this method as they would be
        collected too.

        Args:
            teacher: Passed to the parent ``distill`` (presumably identifies the teacher - TODO confirm).
            trn_data: Training set.
            dev_data: Development set.
            save_dir: Passed to the parent ``distill``.
            transformer: Passed to the parent ``distill`` (presumably identifies the student transformer).
            batch_size: Passed to the parent ``distill``.
            temperature_scheduler: Passed to the parent ``distill``.
            epochs: Passed to the parent ``distill``.
            devices: Passed to the parent ``distill``.
            logger: Passed to the parent ``distill``.
            seed: Passed to the parent ``distill``.
            **kwargs: Merged with the arguments above.

        Returns:
            Whatever the parent ``distill`` returns.
        """
        return super().distill(**merge_locals_kwargs(locals(), kwargs))


================================================
FILE: hanlp/components/taggers/transformers/transformer_tagger_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 13:55
import math

import tensorflow as tf

from hanlp.common.transform_tf import Transform
from hanlp.components.taggers.tagger_tf import TaggerComponent
from hanlp.components.taggers.transformers.transformer_transform_tf import TransformerTransform
from hanlp.layers.transformers.loader_tf import build_transformer
from hanlp.layers.transformers.utils_tf import build_adamw_optimizer
from hanlp.losses.sparse_categorical_crossentropy import SparseCategoricalCrossentropyOverBatchFirstDim
from hanlp_common.util import merge_locals_kwargs


class TransformerTaggingModel(tf.keras.Model):
    """A keras model which keeps a reference to a transformer."""

    def __init__(self, transformer: tf.keras.Model, *args, **kwargs):
        """
        Args:
            transformer: The transformer to keep in ``self.transformer``.
            *args: Passed to ``tf.keras.Model``.
            **kwargs: Passed to ``tf.keras.Model``.
        """
        super().__init__(*args, **kwargs)
        self.transformer = transformer

    def call(self, inputs, training=None, mask=None):
        """Delegate to the ``call`` of ``tf.keras.Model``."""
        return super().call(inputs, training=training, mask=mask)


class TransformerTaggerTF(TaggerComponent):
    """A TensorFlow tagger built on a transformer."""

    def __init__(self, transform: TransformerTransform = None) -> None:
        """
        Args:
            transform: The transform to use. A new ``TransformerTransform`` is created if None.
        """
        transform = TransformerTransform() if transform is None else transform
        super().__init__(transform)
        self.transform: TransformerTransform = transform

    def build_model(self, transformer, max_seq_length, **kwargs) -> tf.keras.Model:
        """Build the transformer model and hand its tokenizer over to the transform.

        Args:
            transformer: Passed to ``build_transformer``.
            max_seq_length: Passed to ``build_transformer``.
            **kwargs: Not used.

        Returns:
            The model built by ``build_transformer``.
        """
        num_tags = len(self.transform.tag_vocab)
        model, tokenizer = build_transformer(transformer, max_seq_length, num_tags, tagging=True)
        self.transform.tokenizer = tokenizer
        return model

    def fit(self, trn_data, dev_data, save_dir,
            transformer,
            optimizer='adamw',
            learning_rate=5e-5,
            weight_decay_rate=0,
            epsilon=1e-8,
            clipnorm=1.0,
            warmup_steps_ratio=0,
            use_amp=False,
            max_seq_length=128,
            batch_size=32,
            epochs=3,
            metrics='accuracy',
            run_eagerly=False,
            logger=None,
            verbose=True,
            **kwargs):
        """Train this tagger. All the arguments are collected with ``merge_locals_kwargs(locals(), kwargs)`` and
        forwarded to the parent ``fit``, so no local variable may be introduced in this method.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    # noinspection PyMethodOverriding
    def build_optimizer(self, optimizer, learning_rate, epsilon, weight_decay_rate, clipnorm, use_amp, train_steps,
                        warmup_steps, **kwargs):
        """Build an AdamW optimizer when ``optimizer`` is ``'adamw'``, otherwise defer to the parent class.

        Returns:
            The optimizer.
        """
        if optimizer != 'adamw':
            return super().build_optimizer(optimizer)
        return build_adamw_optimizer(self.config, learning_rate, epsilon, clipnorm, train_steps, use_amp,
                                     warmup_steps, weight_decay_rate)

    def build_vocab(self, trn_data, logger):
        """Build the vocab through the parent class, then derive ``config.warmup_steps`` from the number of
        training examples, ``warmup_steps_ratio``, ``batch_size`` and ``epochs``.

        Returns:
            The number of training examples reported by the parent class.
        """
        num_examples = super().build_vocab(trn_data, logger)
        config = self.config
        steps_per_epoch = math.ceil(num_examples * config.warmup_steps_ratio / config.batch_size)
        config.warmup_steps = steps_per_epoch * config.epochs
        return num_examples

    def train_loop(self, trn_data, dev_data, epochs, num_examples, train_steps_per_epoch, dev_steps, model, optimizer,
                   loss, metrics, callbacks, logger, **kwargs):
        """Run the keras training loop of ``self.model``.

        Returns:
            The history object returned by ``fit`` of the model.
        """
        # Masking out the padding labels through class_weight was considered here but is left disabled:
        # class_weight=dict((i, 0 if i == 0 else 1) for i in range(len(self.transform.tag_vocab)))
        return self.model.fit(trn_data,
                              epochs=epochs,
                              steps_per_epoch=train_steps_per_epoch,
                              validation_data=dev_data,
                              callbacks=callbacks,
                              validation_steps=dev_steps)

    def build_loss(self, loss, **kwargs):
        """Use ``SparseCategoricalCrossentropyOverBatchFirstDim`` unless a loss is specified."""
        if loss:
            return super().build_loss(loss, **kwargs)
        return SparseCategoricalCrossentropyOverBatchFirstDim()

    def load_transform(self, save_dir) -> Transform:
        """Load the transform through the parent class, then rebuild its tokenizer from the config.

        Returns:
            ``self.transform``.
        """
        super().load_transform(save_dir)
        config = self.config
        tokenizer = build_transformer(config.transformer, config.max_seq_length,
                                      len(self.transform.tag_vocab), tagging=True, tokenizer_only=True)
        self.transform.tokenizer = tokenizer
        return self.transform


================================================
FILE: hanlp/components/taggers/transformers/transformer_transform_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 15:14
from typing import Union, Tuple, List, Iterable

import tensorflow as tf

from hanlp_common.structure import SerializableDict
from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp.layers.transformers.utils_tf import convert_examples_to_features
from hanlp.transform.tsv_tf import TsvTaggingFormat


class TransformerTransform(TsvTaggingFormat, Transform):
    def __init__(self,
                 tokenizer=None,
                 config: SerializableDict = None,
                 map_x=False, map_y=False, **kwargs) -> None:
        """
        Args:
            tokenizer: Optional tokenizer. It is stored directly without going through the ``tokenizer`` setter,
                hence ``special_token_ids`` stays None until the setter is used.
            config: Passed to the parent constructors.
            map_x: Passed to the parent constructors.
            map_y: Passed to the parent constructors.
            **kwargs: Passed to the parent constructors.
        """
        super().__init__(config, map_x, map_y, **kwargs)
        # BERT style special tokens by default. The ``tokenizer`` setter may replace them.
        self.pad = '[PAD]'
        self.unk = '[UNK]'
        self._tokenizer = tokenizer
        self.special_token_ids = None
        self.tag_vocab: VocabTF = None

    @property
    def max_seq_length(self):
        """The configured ``max_seq_length`` (128 by default) without the room taken by special tokens."""
        # Two positions are reserved for the special tokens [CLS] and [SEP].
        num_special_tokens = 2
        return self.config.get('max_seq_length', 128) - num_special_tokens

    @property
    def tokenizer(self):
        """The tokenizer of the transformer."""
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, tokenizer):
        """Set the tokenizer and adapt the pad and unk tokens as well as ``special_token_ids`` to its vocab."""
        self._tokenizer = tokenizer
        if hasattr(tokenizer, '_vocab'):
            vocab = tokenizer._vocab
        else:
            vocab = tokenizer.vocab
        if self.pad not in vocab:
            # English albert use <pad> instead of [PAD]
            self.pad = '<pad>'
        if self.unk not in vocab:
            self.unk = '<unk>'
        special_ids = [vocab[token] for token in [self.pad, '[CLS]', '[SEP]']]
        self.special_token_ids = tf.constant(special_ids, dtype=tf.int32)

    def fit(self, trn_path: str, **kwargs) -> int:
        """Build a fresh tag vocab from the gold samples of a training file.

        Args:
            trn_path: Passed to ``file_to_inputs``.
            **kwargs: Not used.

        Returns:
            The number of samples read.
        """
        tag_vocab = VocabTF(unk_token=None)
        self.tag_vocab = tag_vocab
        num_samples = 0
        for num_samples, (_, tags) in enumerate(self.file_to_inputs(trn_path, gold=True), start=1):
            tag_vocab.update(tags)
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Describe the samples as ``((input_ids, input_mask, segment_ids), label_ids)``.

        Returns:
            The types, the shapes and the padding values of the samples.
        """
        seq_len = self.config.get('max_seq_length', 128)
        num_features = 3  # input_ids, input_mask, segment_ids
        types = (tf.int32,) * num_features, tf.int32
        shapes = tuple([seq_len] for _ in range(num_features)), [None]
        values = (0,) * num_features, self.tag_vocab.pad_idx
        return types, shapes, values

    def lock_vocabs(self):
        """Lock the vocabs by delegating to the parent class. Nothing is returned."""
        super().lock_vocabs()

    def inputs_to_samples(self, inputs, gold=False):
        """Convert inputs into features of the transformer.

        Args:
            inputs: An iterable of samples. A sample is a ``(words, tags)`` pair if ``gold`` is true, otherwise
                it is the words only.
            gold: True if the samples carry gold tags. Otherwise the tag with index 1 in the tag vocab is used
                as a placeholder for every word.

        Yields:
            ``((input_ids, input_mask, segment_ids), label_ids)`` for each sample.
        """
        max_seq_length = self.config.get('max_seq_length', 128)
        tokenizer = self._tokenizer
        # Only the BERT style layout is produced: the XLNet and RoBERTa switches are always off.
        xlnet = False
        roberta = False
        cls_token = '[CLS]'
        sep_token = '[SEP]'
        unk_token = self.unk

        pad_label_idx = self.tag_vocab.pad_idx
        pad_token_id = tokenizer.convert_tokens_to_ids([self.pad])[0]
        for sample in inputs:
            if gold:
                words, tags = sample
            else:
                words, tags = sample, [self.tag_vocab.idx_to_token[1]] * len(sample)

            input_ids, input_mask, segment_ids, label_ids = convert_examples_to_features(
                words,
                max_seq_length, tokenizer,
                tags,
                self.tag_vocab.token_to_idx,
                # xlnet has a cls token at the end
                cls_token_at_end=xlnet,
                cls_token=cls_token,
                cls_token_segment_id=2 if xlnet else 0,
                sep_token=sep_token,
                # roberta uses an extra separator b/w pairs of sentences,
                # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                sep_token_extra=roberta,
                # pad on the left for xlnet
                pad_on_left=xlnet,
                pad_token_id=pad_token_id,
                pad_token_segment_id=4 if xlnet else 0,
                pad_token_label_id=pad_label_idx,
                unk_token=unk_token)

            # Diagnostics: dump any feature which contains None.
            if None in input_ids:
                print(input_ids)
            if None in input_mask:
                print(input_mask)
            if None in segment_ids:
                # The feature being checked is printed (input_mask used to be printed here by mistake).
                print(segment_ids)
            yield (input_ids, input_mask, segment_ids), label_ids

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        """Not supported by this transform.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('transformers has its own tagger, not need to convert idx for x')

    def y_to_idx(self, y) -> tf.Tensor:
        """Not supported by this transform.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('transformers has its own tagger, not need to convert idx for y')

    def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
        """Tell whether ``input`` is a single sample, which is the case when its first element is a ``str``.

        NOTE(review): an empty ``input`` raises ``IndexError`` here - confirm that callers never pass one.
        """
        return isinstance(input[0], str)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, X=None, inputs=None, batch=None,
                     **kwargs) -> Iterable:
        """Convert the scores of a batch into the tags of each input.

        Args:
            Y: The scores, indexed as ``[batch, position, tag]``.
            gold: Not used.
            X: Not used.
            inputs: The inputs of the batch. The length of each one decides how many tags it gets.
            batch: The batch, whose second element is used as the label mask.
            **kwargs: Not used.

        Yields:
            The tags of each input.
        """
        assert batch is not None, 'Need the batch to know actual length of Y'
        # presumably the input_mask of the (input_ids, input_mask, segment_ids) features - TODO confirm
        label_mask = batch[1]
        if self.tag_vocab.pad_token:
            # Make sure the padding tag never wins the argmax.
            # NOTE(review): item assignment needs a mutable array; verify that Y is not a plain tf.Tensor here.
            Y[:, :, self.tag_vocab.pad_idx] = float('-inf')
        Y = tf.argmax(Y, axis=-1)
        # Keep the masked-in positions only, which flattens the batch into one sequence of tag ids.
        Y = Y[label_mask > 0]
        tags = [self.tag_vocab.idx_to_token[tid] for tid in Y]
        # Split the flat tags back into one slice per input.
        offset = 0
        for words in inputs:
            yield tags[offset:offset + len(words)]
            offset += len(words)


================================================
FILE: hanlp/components/taggers/util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-01 00:31
from typing import List, Tuple
from hanlp.utils.span_util import allowed_transitions


def guess_tagging_scheme(labels: List[str]) -> str:
    """Guess the tagging scheme of a label set from the prefixes of its labels.

    The prefix of a label is the part before its first ``-``. A scheme matches when the set of prefixes is exactly
    the set of letters of its name.

    Args:
        labels: The labels, e.g., ``['B-PER', 'I-PER', 'O']``.

    Returns:
        One of ``BIO``, ``BIOUL``, ``BMES`` and ``IOBES``, or None if no scheme matches.
    """
    prefixes = {label.split('-')[0] for label in labels}
    candidates = ("BIO", "BIOUL", "BMES", 'IOBES')
    return next((scheme for scheme in candidates if set(scheme) == prefixes), None)


def guess_allowed_transitions(labels) -> List[Tuple[int, int]]:
    """Guess the allowed transitions between labels according to their tagging scheme.

    Args:
        labels: The labels ordered by their indices.

    Returns:
        The result of ``allowed_transitions`` for the guessed scheme, or None if the scheme cannot be guessed.
        ``IOBES`` labels are converted to ``BIOUL`` ones first.
    """
    scheme = guess_tagging_scheme(labels)
    if not scheme:
        return None
    if scheme == 'IOBES':
        scheme = 'BIOUL'
        # Only the leading tag is rewritten. Replacing every 'E-' or 'S-' in a label would also alter entity
        # names which happen to contain these substrings, e.g., 'B-CODE-X'.
        iobes_to_bioul = {'E-': 'L-', 'S-': 'U-'}
        labels = [iobes_to_bioul.get(y[:2], y[:2]) + y[2:] for y in labels]
    return allowed_transitions(scheme, dict(enumerate(labels)))


================================================
FILE: hanlp/components/tokenizers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-11 02:48

================================================
FILE: hanlp/components/tokenizers/multi_criteria_cws_transformer.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-21 19:55
from typing import List, Union

from hanlp.common.dataset import SamplerBuilder
from hanlp.components.taggers.transformers.transformer_tagger import TransformerTagger
from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer
from hanlp.datasets.tokenization.loaders.multi_criteria_cws.mcws_dataset import MultiCriteriaTextTokenizingDataset, append_criteria_token
import functools

from hanlp.metrics.f1 import F1
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs


class MultiCriteriaTransformerTaggingTokenizer(TransformerTaggingTokenizer):
    def __init__(self, **kwargs) -> None:
        r"""Transformer based implementation of "Effective Neural Solution for Multi-Criteria Word Segmentation"
        (:cite:`he2019effective`). It uses an artificial token ``[unused_i]`` instead of ``[SEP]`` in the input_ids to
        mark the i-th segmentation criteria.

        Args:
            **kwargs: Forwarded to the parent constructor.
        """
        super().__init__(**kwargs)

    def build_dataset(self, data, **kwargs):
        """Wrap ``data`` into a ``MultiCriteriaTextTokenizingDataset``."""
        dataset = MultiCriteriaTextTokenizingDataset(data, **kwargs)
        return dataset

    def on_config_ready(self, **kwargs):
        """Collect the ``[unused_i]`` tokens known by the tokenizer and create an empty criteria map, unless the
        config already has a ``criteria_token_map``.
        """
        super().on_config_ready(**kwargs)
        if 'criteria_token_map' in self.config:
            return
        tokenizer = self.transformer_tokenizer
        candidates = [f'[unused{i}]' for i in range(1, 100)]
        candidate_ids = tokenizer.convert_tokens_to_ids(candidates)
        unk_id = tokenizer.unk_token_id
        # Tokens mapped to unk do not exist in the vocab of the tokenizer.
        # noinspection PyAttributeOutsideInit
        self.config.unused_tokens = {token: idx for token, idx in zip(candidates, candidate_ids) if idx != unk_id}
        self.config.criteria_token_map = dict()

    def last_transform(self):
        """Extend the transforms of the parent class with the one which appends the criteria token."""
        transforms = super().last_transform()
        append_criteria = functools.partial(append_criteria_token,
                                            criteria_tokens=self.config.unused_tokens,
                                            criteria_token_map=self.config.criteria_token_map)
        transforms.append(append_criteria)
        return transforms

    def build_vocabs(self, trn, logger, **kwargs):
        """Build the vocabs through the parent class and report the criteria found."""
        super().build_vocabs(trn, logger, **kwargs)
        criteria_map = self.config.criteria_token_map
        logger.info(f'criteria[{len(criteria_map)}] = {list(criteria_map)}')

    def feed_batch(self, batch: dict):
        """Feed a batch like ``TransformerTagger`` does, then drop the positions of the special tokens."""
        outputs, mask = TransformerTagger.feed_batch(self, batch)
        # strip [CLS], [SEP] and [unused_i]
        stripped = outputs[:, 1:-2, :]
        return stripped, mask

    def build_samples(self, data: List[str], criteria=None, **kwargs):
        """Build samples tagged with a criteria.

        Args:
            data: Passed to the parent class.
            criteria: The criteria of every sample. The first known criteria is used if it is not specified.
            **kwargs: Passed to the parent class.

        Returns:
            The samples of the parent class, each with a ``criteria`` field.
        """
        criteria_map = self.config.criteria_token_map
        if criteria:
            assert criteria in criteria_map, \
                f'Unsupported criteria {criteria}. Choose one from {list(criteria_map.keys())}'
        else:
            criteria = next(iter(criteria_map))
        samples = super().build_samples(data, **kwargs)
        for each in samples:
            each['criteria'] = criteria
        return samples

    def build_metric(self, **kwargs):
        """Create one ``F1`` per criteria."""
        per_criteria = MetricDict()
        for name in self.config.criteria_token_map:
            per_criteria[name] = F1()
        return per_criteria

    def update_metrics(self, metric, logits, y, mask, batch, prediction):
        """Update the metric of each sample's criteria with its predicted and gold spans."""
        gold_spans = self.tag_to_span(batch['tag'])
        for predicted, gold, criteria in zip(prediction, gold_spans, batch['criteria']):
            metric[criteria](set(predicted), set(gold))

    def fit(self, trn_data, dev_data, save_dir, transformer, average_subwords=False, word_dropout: float = 0.2,
            hidden_dropout=None, layer_dropout=0, scalar_mix=None, mix_embedding: int = 0, grad_norm=5.0,
            transformer_grad_norm=None, lr=5e-5,
            transformer_lr=None, transformer_layers=None, gradient_accumulation=1,
            adam_epsilon=1e-8, weight_decay=0, warmup_steps=0.1, crf=False, reduction='sum',
            batch_size=32, sampler_builder: SamplerBuilder = None, epochs=30, patience=5, token_key=None,
            tagging_scheme='BMES', delimiter=None,
            max_seq_len=None, sent_delimiter=None, char_level=False, hard_constraint=False, transform=None, logger=None,
            devices: Union[float, int, List[int]] = None, **kwargs):
        """Train this tokenizer. All the arguments are collected with ``merge_locals_kwargs(locals(), kwargs)``
        and forwarded to the parent ``fit``, so no local variable may be introduced in this method.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        return super().fit(**merge_locals_kwargs(locals(), kwargs))


================================================
FILE: hanlp/components/tokenizers/tok.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-12 13:08
from typing import Any, Callable

from hanlp.components.taggers.rnn_tagger import RNNTagger
from hanlp.datasets.tokenization.loaders.chunking_dataset import ChunkingDataset
from hanlp.metrics.chunking.chunking_f1 import ChunkingF1
from hanlp.utils.span_util import bmes_to_words
from hanlp_common.util import merge_locals_kwargs


class RNNTokenizer(RNNTagger):
    """A tokenizer which tags every character with an ``RNNTagger`` and merges the characters into words."""

    def predict(self, sentence: Any, batch_size: int = None, **kwargs):
        """Tokenize one sentence or several sentences.

        Args:
            sentence: A ``str`` or an iterable of ``str``. It is left untouched.
            batch_size: Passed to ``RNNTagger.predict``.
            **kwargs: Passed to ``RNNTagger.predict``.

        Returns:
            The output for the sentence if a ``str`` is given, otherwise the outputs of all sentences.
        """
        flat = isinstance(sentence, str)
        if flat:
            sentence = [sentence]
        # Split into characters in a new list rather than overwriting the elements of the caller's list.
        chars = [list(s) for s in sentence]
        outputs = RNNTagger.predict(self, chars, batch_size, **kwargs)
        if flat:
            return outputs[0]
        return outputs

    def predict_data(self, data, batch_size, **kwargs):
        """Tag ``data`` with ``RNNTagger.predict_data`` and convert the tags of each sample into words."""
        tags = RNNTagger.predict_data(self, data, batch_size, **kwargs)
        return [bmes_to_words(c, t) for c, t in zip(data, tags)]

    def build_dataset(self, data, transform=None):
        """Wrap ``data`` into a ``ChunkingDataset`` with the transform of the config (if any) followed by
        ``transform`` (if any).
        """
        dataset = ChunkingDataset(data)
        if 'transform' in self.config:
            dataset.append_transform(self.config.transform)
        if transform:
            dataset.append_transform(transform)
        return dataset

    def build_metric(self, **kwargs):
        """Create a ``ChunkingF1``."""
        return ChunkingF1()

    def update_metrics(self, metric, logits, y, mask, batch):
        """Decode the logits into tags and update the metric against the gold ``tag`` field of the batch."""
        pred = self.decode_output(logits, mask, batch)
        pred = self._id_to_tags(pred)
        gold = batch['tag']
        metric(pred, gold)

    def fit(self, trn_data, dev_data, save_dir, batch_size=50, epochs=100, embed=100, rnn_input=None, rnn_hidden=256,
            drop=0.5, lr=0.001, patience=10, crf=True, optimizer='adam', token_key='char', tagging_scheme=None,
            anneal_factor: float = 0.5, anneal_patience=2, devices=None, logger=None,
            verbose=True, transform: Callable = None, **kwargs):
        """Train this tokenizer. All the arguments are collected with ``merge_locals_kwargs(locals(), kwargs)``
        and forwarded to the parent ``fit``, so no local variable may be introduced in this method.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        return super().fit(**merge_locals_kwargs(locals(), kwargs))


================================================
FILE: hanlp/components/tokenizers/tok_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-27 14:30
import logging
from typing import Union, Any, List, Tuple, Iterable

import tensorflow as tf

from hanlp.common.keras_component import KerasComponent
from hanlp.components.taggers.ngram_conv.ngram_conv_tagger import NgramTransform, NgramConvTaggerTF
from hanlp.components.taggers.rnn_tagger_tf import RNNTaggerTF
from hanlp.components.taggers.transformers.transformer_tagger_tf import TransformerTaggerTF
from hanlp.components.taggers.transformers.transformer_transform_tf import TransformerTransform
from hanlp.losses.sparse_categorical_crossentropy import SparseCategoricalCrossentropyOverBatchFirstDim
from hanlp.metrics.chunking.bmes_tf import BMES_F1_TF
from hanlp.transform.tsv_tf import TSVTaggingTransform
from hanlp.transform.txt_tf import TxtFormat, TxtBMESFormat, extract_ngram_features_and_tags, bmes_to_words
from hanlp_common.util import merge_locals_kwargs


class BMESTokenizerTF(KerasComponent):
    """A keras component which supports the ``f1`` metric over BMES tags."""

    def build_metrics(self, metrics, logger: logging.Logger, **kwargs):
        """Build ``BMES_F1_TF`` for ``'f1'`` (which turns ``run_eagerly`` on), otherwise defer to the parent class."""
        if metrics != 'f1':
            return super().build_metrics(metrics, logger, **kwargs)
        self.config.run_eagerly = True
        return BMES_F1_TF(self.transform.tag_vocab)


class NgramConvTokenizerTransform(TxtFormat, NgramTransform):
    """Transform of the ngram convolutional tokenizer."""

    def inputs_to_samples(self, inputs, gold=False):
        """Yield the ngram features and tags of each sentence. A single sample is treated as a batch of one."""
        sentences = [inputs] if self.input_is_single_sample(inputs) else inputs
        bigram_only = False
        for sent in sentences:
            yield extract_ngram_features_and_tags(sent, bigram_only, self.config.window_size, gold)

    def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
        """An empty input or a ``str`` is a single sample."""
        return not input or isinstance(input, str)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
                     **kwargs) -> Iterable:
        """Yield the tokens produced by ``TxtBMESFormat.Y_to_tokens`` using the tag vocab of this transform."""
        tokens = TxtBMESFormat.Y_to_tokens(self, self.tag_vocab, Y, gold, inputs)
        yield from tokens


class NgramConvTokenizerTF(BMESTokenizerTF, NgramConvTaggerTF):
    """A tokenizer built on the ngram convolutional tagger."""

    def __init__(self) -> None:
        super().__init__(NgramConvTokenizerTransform())

    def fit(self, trn_data: Any, dev_data: Any, save_dir: str, word_embed: Union[str, int, dict] = 200,
            ngram_embed: Union[str, int, dict] = 50, embedding_trainable=True, window_size=4, kernel_size=3,
            filters=(200, 200, 200, 200, 200), dropout_embed=0.2, dropout_hidden=0.2, weight_norm=True,
            loss: Union[tf.keras.losses.Loss, str] = None,
            optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='f1', batch_size=100,
            epochs=100, logger=None, verbose=True, **kwargs):
        """Train this tokenizer. All the arguments are collected with ``merge_locals_kwargs(locals(), kwargs)``
        and forwarded to the parent ``fit``, so no local variable may be introduced in this method.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def evaluate_output_to_file(self, batch, outputs, out):
        """Write one line per sample of the batch, made of its input and its predicted output, into ``out``."""
        transform = self.transform
        sources = transform.X_to_inputs(batch[0])
        predictions = transform.Y_to_outputs(outputs, gold=False)
        for x, y_pred in zip(sources, predictions):
            out.write(transform.input_truth_output_to_str(x, None, y_pred))
            out.write('\n')

    def build_loss(self, loss, **kwargs):
        """Use ``SparseCategoricalCrossentropyOverBatchFirstDim`` when ``loss`` is None."""
        if loss is not None:
            return super().build_loss(loss, **kwargs)
        return SparseCategoricalCrossentropyOverBatchFirstDim()


class TransformerTokenizerTransform(TxtBMESFormat, TransformerTransform):
    """Transform chaining the BMES text format with the transformer transform for tokenization."""

    def inputs_to_samples(self, inputs, gold=False):
        """Convert ``inputs`` to BMES samples first, then pass them through the transformer transform."""
        bmes_samples = TxtBMESFormat.inputs_to_samples(self, inputs, gold)
        yield from TransformerTransform.inputs_to_samples(self, bmes_samples, True)

    def Y_to_tokens(self, tag_vocab, Y, gold, inputs):
        """Yield the words of every input text according to the tags in ``Y``.

        Args:
            tag_vocab: Vocabulary whose ``idx_to_token`` maps tag ids to tag strings.
            Y: Tag ids when ``gold`` is truthy, otherwise scores reduced with ``argmax`` over axis 2.
            gold: Whether ``Y`` already holds tag ids.
            inputs: The texts aligned with ``Y``.
        """
        tag_ids_batch = Y if gold else tf.argmax(Y, axis=2)
        for text, tag_ids in zip(inputs, tag_ids_batch):
            # Position 0 is skipped and only ``len(text)`` tags are kept
            # (presumably dropping special tokens and padding - TODO confirm).
            kept = tag_ids[1:len(text) + 1]
            tags = [tag_vocab.idx_to_token[int(tag_id)] for tag_id in kept]
            yield bmes_to_words(list(text), tags)


class TransformerTokenizerTF(BMESTokenizerTF, TransformerTaggerTF):
    """BMES tokenizer built on the transformer tagger."""

    def __init__(self, transform: TransformerTokenizerTransform = None) -> None:
        """Create the tokenizer, using a fresh ``TransformerTokenizerTransform`` when ``transform`` is ``None``."""
        super().__init__(TransformerTokenizerTransform() if transform is None else transform)


class RNNTokenizerTransform(TxtBMESFormat, TSVTaggingTransform):
    """Transform for the RNN tokenizer; it only combines ``TxtBMESFormat`` with ``TSVTaggingTransform``."""


class RNNTokenizerTF(BMESTokenizerTF, RNNTaggerTF):
    """BMES tokenizer built on the RNN tagger."""

    def __init__(self, transform: RNNTokenizerTransform = None) -> None:
        """Create the tokenizer, using a fresh ``RNNTokenizerTransform`` when ``transform`` is falsy."""
        super().__init__(transform or RNNTokenizerTransform())

    def fit(self, trn_data: str, dev_data: str = None, save_dir: str = None, embeddings=100, embedding_trainable=False,
            rnn_input_dropout=0.2, rnn_units=100, rnn_output_dropout=0.2, epochs=20, lower=False, max_seq_len=50,
            logger=None, loss: Union[tf.keras.losses.Loss, str] = None,
            optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='f1', batch_size=32,
            dev_batch_size=32, lr_decay_per_epoch=None, verbose=True, **kwargs):
        """Train the tokenizer; all arguments are forwarded to the parent ``fit`` via ``merge_locals_kwargs``."""
        # NOTE: ``locals()`` is captured here, so no extra local variables may be introduced in this method.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))


================================================
FILE: hanlp/components/tokenizers/transformer.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-11 02:48
import functools
from typing import TextIO, Union, List, Dict, Any, Set

import torch
from hanlp.common.dataset import SamplerBuilder
from hanlp.common.transform import TransformList
from hanlp.components.taggers.transformers.transformer_tagger import TransformerTagger
from hanlp.datasets.tokenization.loaders.txt import TextTokenizingDataset, generate_tags_for_subtokens
from hanlp.metrics.f1 import F1
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp.utils.span_util import bmes_to_spans
from hanlp.utils.string_util import possible_tokenization
from hanlp_common.util import merge_locals_kwargs
from hanlp_trie import DictInterface, TrieDict
from hanlp_trie.dictionary import TupleTrieDict


class TransformerTaggingTokenizer(TransformerTagger):

    def __init__(self, **kwargs) -> None:
        """A tokenizer that uses a transformer tagger for span prediction. It features two high-performance
        dictionaries to handle edge cases in real applications.

        - ``dict_force``: High priority dictionary which performs longest-prefix-matching on the input text and takes
          priority over model predictions.
        - ``dict_combine``: Low priority dictionary which performs longest-prefix-matching on model predictions and
          then combines them.

        .. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what a dictionary
            can do and what it can't do. The tutorial in `this book <http://nlp.hankcs.com/book.php>`_ can be very
            helpful.

        It also supports outputting the span of each token by setting ``config.output_spans = True``.

        Args:
            **kwargs: Predefined config.
        """
        super().__init__(**kwargs)

    @property
    def dict_force(self) -> DictInterface:
        r""" The high priority dictionary which perform longest-prefix-matching on inputs to split them into two subsets:

        1. spans containing no keywords, which are then fed into tokenizer for further tokenization.
        2. keywords, which will be outputed without furthur tokenization.

        .. Caution::
            Longest-prefix-matching **NEVER** guarantee the presence of any keywords. Abuse of
            ``dict_force`` can lead to low quality results. For more details, refer to
            `this book <http://nlp.hankcs.com/book.php>`_.

        Examples:
            >>> tok.dict_force = {'和服', '服务行业'} # Force '和服' and '服务行业' by longest-prefix-matching
            >>> tok("商品和服务行业")
                ['商品', '和服', '务行业']
            >>> tok.dict_force = {'和服务': ['和', '服务']} # Force '和服务' to be tokenized as ['和', '服务']
            >>> tok("商品和服务行业")
                ['商品', '和', '服务', '行业']
        """
        return self.config.get('dict_force', None)

    @dict_force.setter
    def dict_force(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
        if dictionary is not None and not isinstance(dictionary, DictInterface):
            dictionary = TrieDict(dictionary)
        self.config.dict_force = dictionary
        self.tokenizer_transform.dict = dictionary

    @property
    def dict_combine(self) -> DictInterface:
        """ The low priority dictionary which perform longest-prefix-matching on model predictions and combing them.

        Examples:
            >>> tok.dict_combine = {'和服', '服务行业'}
            >>> tok("商品和服务行业") # '和服' is not in the original results ['商品', '和', '服务']. '服务', '行业' are combined to '服务行业'
                ['商品', '和', '服务行业']

        """
        return self.config.get('dict_combine', None)

    @dict_combine.setter
    def dict_combine(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
        if dictionary is not None and not isinstance(dictionary, DictInterface):
            if all(isinstance(k, str) for k in dictionary):
                dictionary = TrieDict(dictionary)
            else:
                _d = set()
                for k in dictionary:
                    if isinstance(k, str):
                        _d.update(possible_tokenization(k))
                    else:
                        _d.add(k)
                dictionary = TupleTrieDict(_d)
        self.config.dict_combine = dictionary

    def build_metric(self, **kwargs):
        """Return a fresh :class:`F1` metric; ``kwargs`` are ignored."""
        return F1()

    # noinspection PyMethodOverriding
    def update_metrics(self, metric, logits, y, mask, batch, prediction):
        """Feed ``metric`` with the predicted and gold span sets of every sentence in the batch.

        Gold spans are derived from ``batch['tag']`` through :meth:`tag_to_span`; ``logits``, ``y`` and ``mask``
        are not used.
        """
        gold_spans = self.tag_to_span(batch['tag'], batch)
        for predicted, expected in zip(prediction, gold_spans):
            metric(set(predicted), set(expected))

    def decode_output(self, logits, mask, batch, model=None):
        """Decode model outputs into spans.

        The parent decoder's result is converted to a list if it is a tensor, mapped to tag strings truncated to the
        length of each entry in ``batch['token']``, and finally converted to spans by :meth:`tag_to_span`.
        """
        decoded = super().decode_output(logits, mask, batch, model)
        tag_ids = decoded.tolist() if isinstance(decoded, torch.Tensor) else decoded
        lengths = [len(tokens) for tokens in batch['token']]
        return self.tag_to_span(self.id_to_tags(tag_ids, lengths), batch)

    def tag_to_span(self, batch_tags, batch: dict):
        """Convert tag sequences into spans, after patching the tags in place.

        Args:
            batch_tags: One mutable tag sequence per sentence; it is modified in place.
            batch: The batch. ``custom_words`` (optional) holds ``(start, end, label)`` triples whose tags are
                overwritten so that each triple becomes exactly one span. When ``token_subtoken_offsets_group`` is
                absent, ``token_subtoken_offsets`` is used to glue together subtokens whose offsets overlap.

        Returns:
            A list with the result of ``bmes_to_spans`` for every sentence.
        """
        if 'custom_words' in batch:
            # Tag names for a single-unit word, an inner unit and a final unit under the configured scheme.
            if self.config.tagging_scheme == 'BMES':
                single, middle, last = 'S', 'M', 'E'
            else:
                single, middle, last = 'B', 'I', 'I'
            for tags, custom_words in zip(batch_tags, batch['custom_words']):
                if not custom_words:
                    continue
                for start, end, label in custom_words:
                    if end - start == 1:
                        tags[start] = single
                    else:
                        tags[start] = 'B'
                        tags[end - 1] = last
                        for i in range(start + 1, end - 1):
                            tags[i] = middle
                    # Whatever follows a forced word has to open a new span.
                    if end < len(tags):
                        tags[end] = 'B'
        if 'token_subtoken_offsets_group' not in batch:  # only check prediction on raw text for now
            # Handle a single char that gets split into multiple subtokens, e.g., ‥ -> . + .
            for tags, subtoken_offsets in zip(batch_tags, batch['token_subtoken_offsets']):
                # BERT produces 'ᄒ', '##ᅡ', '##ᆫ' for '한' and they share the same span
                previous_end = -1
                previous_tag = None
                for i, (tag, (begin, end)) in enumerate(zip(tags, subtoken_offsets)):
                    if begin < previous_end:
                        # This subtoken starts before the previous one ended, so both belong to the same
                        # word: reopen the previous tag if it closed a word and continue with 'M'.
                        if previous_tag == 'S':
                            tags[i - 1] = 'B'
                        elif previous_tag == 'E':
                            tags[i - 1] = 'M'
                        tags[i] = tag = 'M'
                    previous_end = end
                    previous_tag = tag
        return [bmes_to_spans(tags) for tags in batch_tags]

    def write_prediction(self, prediction, batch, output: TextIO):
        batch_tokens = self.spans_to_tokens(prediction, batch)
        for tokens in batch_tokens:
            output.write(' '.join(tokens))
            output.write('\n')

    @property
    def tokenizer_transform(self):
        if not self._tokenizer_transform:
            self._tokenizer_transform = TransformerSequenceTokenizer(self.transformer_tokenizer,
                                                                     self.config.token_key,
                                                                     ret_subtokens=True,
                                                                     ret_subtokens_group=True,
                                                                     ret_token_span=False,
                                                                     dict_force=self.dict_force)
        return self._tokenizer_transform

    def spans_to_tokens(self, spans, batch, rebuild_span=False):
        """Turn predicted spans into token strings and optionally merge tokens with ``dict_combine``.

        Args:
            spans: For each sentence, a list of ``(begin, end)`` spans over its subtokens.
            batch: The batch. Subtokens are read from ``batch[self.config.token_key]``; when ``batch['token_']``
                is present and truthy, tokens are instead sliced out of that raw text using
                ``batch['token_subtoken_offsets']``.
            rebuild_span: When truthy, ``spans`` is modified in place so that the spans of tokens merged by
                ``dict_combine`` are replaced with a single combined span.

        Returns:
            A list of token lists, one per sentence.
        """
        batch_tokens = []
        dict_combine = self.dict_combine
        raw_text = batch.get('token_', None)  # Use raw text to rebuild the token according to its offset
        for b, (spans_per_sent, sub_tokens) in enumerate(zip(spans, batch[self.config.token_key])):
            if raw_text:  # This will restore iPhone X as a whole
                text = raw_text[b]
                offsets = batch['token_subtoken_offsets'][b]
                # Slice from the start offset of the first subtoken to the end offset of the last subtoken.
                tokens = [text[offsets[b][0]:offsets[e - 1][-1]] for b, e in spans_per_sent]
            else:  # This will merge iPhone X into iPhoneX
                tokens = [''.join(sub_tokens[span[0]:span[1]]) for span in spans_per_sent]
            if dict_combine:
                buffer = []  # the new token list: untouched tokens plus combined ones
                offset = 0  # index in ``tokens`` right after the last match that was handled
                delta = 0  # number of entries removed from ``spans_per_sent`` by previous merges
                for start, end, label in dict_combine.tokenize(tokens):
                    # Copy the tokens between the previous match and this one unchanged.
                    if offset < start:
                        buffer.extend(tokens[offset:start])
                    if raw_text:
                        # ``start``/``end`` index ``tokens``; subtracting ``delta`` maps them onto the
                        # (possibly already shortened) ``spans_per_sent``.
                        # noinspection PyUnboundLocalVariable
                        combined = text[offsets[spans_per_sent[start - delta][0]][0]:
                                        offsets[spans_per_sent[end - delta - 1][1] - 1][1]]
                    else:
                        combined = ''.join(tokens[start:end])
                    buffer.append(combined)
                    offset = end
                    if rebuild_span:
                        # Replace the spans of the merged tokens with one span covering all of them.
                        start -= delta
                        end -= delta
                        combined_span = (spans_per_sent[start][0], spans_per_sent[end - 1][1])
                        del spans_per_sent[start:end]
                        delta += end - start - 1
                        spans_per_sent.insert(start, combined_span)
                # Copy whatever follows the last match.
                if offset < len(tokens):
                    buffer.extend(tokens[offset:])
                tokens = buffer
            batch_tokens.append(tokens)
        return batch_tokens

    def generate_prediction_filename(self, tst_data, save_dir):
        """Delegate to the parent implementation after replacing every ``.tsv`` in ``tst_data`` with ``.txt``."""
        return super().generate_prediction_filename(tst_data.replace('.tsv', '.txt'), save_dir)

    def prediction_to_human(self, pred, vocab, batch, rebuild_span=False):
        output_spans = self.config.get('output_spans', None)
        tokens = self.spans_to_tokens(pred, batch, rebuild_span or output_spans)
        if output_spans:
            subtoken_spans = batch['token_subtoken_offsets']
            results = []
            for toks, offs, subs in zip(tokens, pred, subtoken_spans):
                r = []
                results.append(r)
                for t, (b, e) in zip(toks, offs):
                    r.append([t, subs[b][0], subs[e - 1][-1]])
            return results
        return tokens

    def input_is_flat(self, tokens):
        """Return ``True`` when ``tokens`` is a single string rather than a collection of strings."""
        return isinstance(tokens, str)

    def build_dataset(self, data, **kwargs):
        """Wrap ``data`` in a :class:`TextTokenizingDataset`, forwarding ``kwargs`` to its constructor."""
        return TextTokenizingDataset(data, **kwargs)

    def last_transform(self):
        """Return a :class:`TransformList` that runs ``generate_tags_for_subtokens`` (bound to the configured
        tagging scheme) before the parent's last transform."""
        return TransformList(functools.partial(generate_tags_for_subtokens, tagging_scheme=self.config.tagging_scheme),
                             super().last_transform())

    def fit(self, trn_data, dev_data, save_dir, transformer, average_subwords=False, word_dropout: float = 0.2,
            hidden_dropout=None, layer_dropout=0, scalar_mix=None, grad_norm=5.0,
            transformer_grad_norm=None, lr=5e-5, eval_trn=True,
            transformer_lr=None, transformer_layers=None, gradient_accumulation=1,
            adam_epsilon=1e-8, weight_decay=0, warmup_steps=0.1, crf=False, reduction='sum',
            batch_size=32, sampler_builder: SamplerBuilder = None, epochs=30, patience=5, token_key=None,
            tagging_scheme='BMES', delimiter=None,
            max_seq_len=None, sent_delimiter=None, char_level=False, hard_constraint=False, transform=None, logger=None,
            devices: Union[float, int, List[int]] = None, **kwargs):
        """Train this tokenizer. All arguments are forwarded to the parent ``fit`` via ``merge_locals_kwargs``.

        Args:
            trn_data: Training set.
            dev_data: Development set.
            save_dir: The directory to save trained component.
            transformer: An identifier of a pre-trained transformer.
            average_subwords: ``True`` to average subword representations.
            word_dropout: Dropout rate to randomly replace a subword with MASK.
            hidden_dropout: Dropout rate applied to hidden states.
            layer_dropout: Randomly zero out hidden states of a transformer layer.
            scalar_mix: Layer attention.
            grad_norm: Gradient norm for clipping.
            transformer_grad_norm: Gradient norm for clipping transformer gradient.
            lr: Learning rate for decoder.
            eval_trn: Presumably whether to evaluate on the training set during training - TODO confirm against the
                parent class.
            transformer_lr: Learning rate for encoder.
            transformer_layers: The number of bottom layers to use.
            gradient_accumulation: Number of batches per update.
            adam_epsilon: The epsilon to use in Adam.
            weight_decay: The weight decay to use.
            warmup_steps: The number of warmup steps.
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
            reduction: The loss reduction used in aggregating losses.
            batch_size: The number of samples in a batch.
            sampler_builder: The builder to build sampler, which will override batch_size.
            epochs: The number of epochs to train.
            patience: The number of patience epochs before early stopping.
            token_key: The key to tokens in dataset.
            tagging_scheme: Either ``BMES`` or ``BI``.
            delimiter: Delimiter between tokens used to split a line in the corpus.
            max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
            sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
                be split here.
            char_level: Whether the sequence length is measured at char level.
            hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
                in a sentence, it will be split at a token anyway.
            transform: An optional transform to be applied to samples. Usually a character normalization transform is
                passed in.
            logger: Any :class:`logging.Logger` instance.
            devices: Devices this component will live on.
            **kwargs: Extra options forwarded to the parent ``fit`` together with the named arguments, e.g. ``seed``,
                the random seed to reproduce this training.

        Returns:
            Best metrics on dev set.
        """
        # NOTE: ``locals()`` is captured here, so no extra local variables may be introduced in this method.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def feed_batch(self, batch: dict):
        """Run the parent's ``feed_batch`` and drop the first and last positions along the second axis of its
        first output, returning the mask unchanged."""
        x, mask = super().feed_batch(batch)
        # presumably the dropped positions are the special tokens wrapping the sequence - TODO confirm
        return x[:, 1:-1, :], mask


================================================
FILE: hanlp/datasets/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-06-13 18:15


================================================
FILE: hanlp/datasets/classification/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-10 11:49

================================================
FILE: hanlp/datasets/classification/sentiment.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-30 21:03
# Common prefix of the ChnSentiCorp splits; each split appends its path after the trailing ``#``.
_ERNIE_TASK_DATA = 'https://ernie.bj.bcebos.com/task_data_zh.tgz#'

CHNSENTICORP_ERNIE_TRAIN = f'{_ERNIE_TASK_DATA}chnsenticorp/train.tsv'
CHNSENTICORP_ERNIE_DEV = f'{_ERNIE_TASK_DATA}chnsenticorp/dev.tsv'
CHNSENTICORP_ERNIE_TEST = f'{_ERNIE_TASK_DATA}chnsenticorp/test.tsv'


================================================
FILE: hanlp/datasets/coref/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-04 13:39

================================================
FILE: hanlp/datasets/coref/loaders/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:03


================================================
FILE: hanlp/datasets/coref/loaders/conll12coref.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-04 15:33
import collections
import os
from typing import Union, List, Callable, DefaultDict, Tuple, Optional, Iterator

from hanlp.datasets.srl.loaders.ontonotes_loader import Ontonotes as _Ontonotes, OntonotesSentence, \
    make_coref_instance

from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import TimingFileIterator


class Ontonotes(_Ontonotes):
    """Ontonotes reader whose document iterator reports loading progress through ``TimingFileIterator``."""

    def dataset_document_iterator(self, file_path: str) -> Iterator[List[OntonotesSentence]]:
        """An iterator over CONLL formatted files which yields documents, regardless
        of the number of document annotations in a particular file. This is useful
        for conll data which has been preprocessed, such as the preprocessing which
        takes place for the 2012 CONLL Coreference Resolution task.

        Args:
          file_path: Path to the CONLL formatted file.

        Returns:
          An iterator of documents, each being a list of sentences.

        """
        open_file = TimingFileIterator(file_path)
        progress_message = f'Loading {os.path.basename(file_path)}'  # loop-invariant, so built only once
        conll_rows = []
        document: List[OntonotesSentence] = []
        for line in open_file:
            open_file.log(progress_message)
            line = line.strip()
            if line != "" and not line.startswith("#"):
                # Non-empty line. Collect the annotation.
                conll_rows.append(line)
            else:
                # A blank line or a comment closes the sentence collected so far.
                if conll_rows:
                    document.append(self._conll_rows_to_sentence(conll_rows))
                    conll_rows = []
            if line.startswith("#end document"):
                yield document
                document = []
        open_file.erase()
        if conll_rows:
            # The file ended right after an annotation row (no trailing blank line, comment or
            # '#end document'), so the last sentence is still buffered and must not be dropped.
            document.append(self._conll_rows_to_sentence(conll_rows))
        if document:
            # Collect any stragglers or files which might not
            # have the '#end document' format for the end of the file.
            yield document


class CONLL12CorefDataset(TransformableDataset):
    """Dataset of CoNLL-2012 coreference documents, one instance per document."""

    def __init__(self, data: Union[str, List], transform: Union[Callable, List] = None, cache=None,
                 max_span_width=10, max_sentences=None, remove_singleton_clusters=False) -> None:
        """Store the instance-building options, then let the parent class handle ``data``.

        Args:
            data: Passed to the parent constructor.
            transform: Passed to the parent constructor.
            cache: Passed to the parent constructor.
            max_span_width: Forwarded to ``make_coref_instance``.
            max_sentences: Forwarded to ``make_coref_instance``.
            remove_singleton_clusters: Forwarded to ``make_coref_instance``.
        """
        # The options are set before calling the parent constructor, as in the original ordering.
        self.max_span_width = max_span_width
        self.max_sentences = max_sentences
        self.remove_singleton_clusters = remove_singleton_clusters
        super().__init__(data, transform, cache)

    def load_file(self, filepath: str):
        """Yield one coreference instance for every document found in ``filepath``."""
        reader = Ontonotes()
        for document in reader.dataset_document_iterator(filepath):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
            tokens_so_far = 0
            for sentence in document:
                # Coref annotations are relative to their sentence, so they are shifted
                # by the number of tokens preceding the sentence in the document.
                for span_id, (start, end) in sentence.coref_spans:
                    clusters[span_id].append((start + tokens_so_far, end + tokens_so_far))
                tokens_so_far += len(sentence.words)
            words_per_sentence = [sentence.words for sentence in document]
            yield self.text_to_instance(words_per_sentence, list(clusters.values()))

    def text_to_instance(
            self,  # type: ignore
            sentences: List[List[str]],
            gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
    ) -> dict:
        """Build an instance from ``sentences`` and ``gold_clusters`` with ``make_coref_instance``."""
        return make_coref_instance(sentences, self.max_span_width, gold_clusters, self.max_sentences,
                                   self.remove_singleton_clusters)


================================================
FILE: hanlp/datasets/eos/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-26 18:11

================================================
FILE: hanlp/datasets/eos/eos.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-26 18:12
import itertools
from collections import Counter
from typing import Union, List, Callable

from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import TimingFileIterator
from hanlp.utils.log_util import cprint
from hanlp.utils.string_util import ispunct


class SentenceBoundaryDetectionDataset(TransformableDataset):

    def __init__(self,
                 data: Union[str, List],
                 transform: Union[Callable, List] = None,
                 cache=None,
                 append_after_sentence=None,
                 eos_chars=None,
                 eos_char_min_freq=200,
                 eos_char_is_punct=True,
                 window_size=5,
                 **kwargs,
                 ) -> None:
        """Dataset for sentence boundary detection (eos).

        Args:
            data: The local or remote path to a dataset, or a list of samples where each sample is a dict.
            transform: Predefined transform(s).
            cache: ``True`` to enable caching, so that transforms won't be called twice.
            append_after_sentence: A :class:`str` to insert at the tail of each sentence. For example, English always
                has a space between sentences.
            eos_chars: Punctuations at the tail of sentences. If ``None``, they will be built from training samples.
            eos_char_min_freq: Minimal frequency to keep an eos char.
            eos_char_is_punct: Limit eos chars to punctuations.
            window_size: Window size to extract ngram features.
            kwargs: Not used.
        """
        # All options are stored before the parent constructor runs, as in the original ordering.
        self.append_after_sentence = append_after_sentence
        self.eos_chars = eos_chars
        self.eos_char_min_freq = eos_char_min_freq
        self.eos_char_is_punct = eos_char_is_punct
        self.window_size = window_size
        super().__init__(data, transform, cache)

    def load_file(self, filepath: str):
        """Load eos corpus.

        Every non-blank line is treated as one sentence. For each occurrence of an eos char in the concatenated
        corpus, a sample is yielded with the surrounding window of characters (``char``) and a float label
        (``label_id``) which is ``1.`` when the occurrence is the last non-whitespace character of a line and
        ``0.`` otherwise.

        Args:
            filepath: Path to the corpus.

        .. highlight:: bash
        .. code-block:: bash

            $ head -n 2 ctb8.txt
            中国经济简讯
            新华社北京十月二十九日电中国经济简讯

        """
        f = TimingFileIterator(filepath)
        sents = []
        eos_offsets = []
        offset = 0
        for line in f:
            if not line.strip():
                continue
            line = line.rstrip('\n')
            # Offset of the last non-whitespace character of this line in the concatenated corpus.
            eos_offsets.append(offset + len(line.rstrip()) - 1)
            offset += len(line)
            if self.append_after_sentence:
                line += self.append_after_sentence
                offset += len(self.append_after_sentence)
            f.log(line)
            sents.append(line)
        f.erase()
        corpus = list(itertools.chain.from_iterable(sents))

        if self.eos_chars:
            if not isinstance(self.eos_chars, set):
                self.eos_chars = set(self.eos_chars)
        else:
            # Build the eos chars from the characters observed at the tail of the lines.
            eos_chars = Counter()
            for i in eos_offsets:
                eos_chars[corpus[i]] += 1
            self.eos_chars = set(k for (k, v) in eos_chars.most_common() if
                                 v >= self.eos_char_min_freq and (not self.eos_char_is_punct or ispunct(k)))
            cprint(f'eos_chars = [yellow]{self.eos_chars}[/yellow]')

        eos_index = 0
        # Only line tails that are eos chars can be positive samples.
        eos_offsets = [i for i in eos_offsets if corpus[i] in self.eos_chars]
        num_eos = len(eos_offsets)
        window_size = self.window_size
        for i, c in enumerate(corpus):
            if c in self.eos_chars:
                # Clamp the left edge: a negative start would wrap around to the end of the corpus and
                # produce an empty (or unrelated) window for eos chars near the beginning.
                window = corpus[max(0, i - window_size): i + window_size + 1]
                # Guard the lookup: eos chars may still occur after the last true boundary, e.g. when the
                # final line contains an eos char but does not end with one.
                is_boundary = eos_index < num_eos and eos_offsets[eos_index] == i
                if is_boundary:
                    eos_index += 1
                yield {'char': window, 'label_id': 1. if is_boundary else 0.}
        assert eos_index == num_eos, f'{eos_index} != {num_eos}'


================================================
FILE: hanlp/datasets/eos/loaders/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:03


================================================
FILE: hanlp/datasets/eos/loaders/nn_eos.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-24 22:51
# Archive of the SETimes (en-hr) sentences; each split appends ``#`` plus its file name.
_SETIMES2_EN_HR_SENTENCES_HOME = 'https://schweter.eu/cloud/nn_eos/SETIMES2.en-hr.sentences.tar.xz'
SETIMES2_EN_HR_HR_SENTENCES_TRAIN = f'{_SETIMES2_EN_HR_SENTENCES_HOME}#SETIMES2.en-hr.hr.sentences.train'
'''Training set of SETimes corpus.'''
SETIMES2_EN_HR_HR_SENTENCES_DEV = f'{_SETIMES2_EN_HR_SENTENCES_HOME}#SETIMES2.en-hr.hr.sentences.dev'
'''Dev set of SETimes corpus.'''
SETIMES2_EN_HR_HR_SENTENCES_TEST = f'{_SETIMES2_EN_HR_SENTENCES_HOME}#SETIMES2.en-hr.hr.sentences.test'
'''Test set of SETimes corpus.'''
# Archive of the Europarl v7 (de-en) sentences; each split appends ``#`` plus its file name.
_EUROPARL_V7_DE_EN_EN_SENTENCES_HOME = 'http://schweter.eu/cloud/nn_eos/europarl-v7.de-en.en.sentences.tar.xz'
EUROPARL_V7_DE_EN_EN_SENTENCES_TRAIN = f'{_EUROPARL_V7_DE_EN_EN_SENTENCES_HOME}#europarl-v7.de-en.en.sentences.train'
'''Training set of Europarl corpus (:cite:`koehn2005europarl`).'''
EUROPARL_V7_DE_EN_EN_SENTENCES_DEV = f'{_EUROPARL_V7_DE_EN_EN_SENTENCES_HOME}#europarl-v7.de-en.en.sentences.dev'
'''Dev set of Europarl corpus (:cite:`koehn2005europarl`).'''
EUROPARL_V7_DE_EN_EN_SENTENCES_TEST = f'{_EUROPARL_V7_DE_EN_EN_SENTENCES_HOME}#europarl-v7.de-en.en.sentences.test'
'''Test set of Europarl corpus (:cite:`koehn2005europarl`).'''


================================================
FILE: hanlp/datasets/lm/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-05 21:41

# Common prefix of the PTB language modeling files; each split appends its path after the trailing ``#``.
_PTB_HOME = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz#'

# Token-level splits.
PTB_TOKEN_TRAIN = f'{_PTB_HOME}data/ptb.train.txt'
PTB_TOKEN_DEV = f'{_PTB_HOME}data/ptb.valid.txt'
PTB_TOKEN_TEST = f'{_PTB_HOME}data/ptb.test.txt'

# Character-level splits.
PTB_CHAR_TRAIN = f'{_PTB_HOME}data/ptb.char.train.txt'
PTB_CHAR_DEV = f'{_PTB_HOME}data/ptb.char.valid.txt'
PTB_CHAR_TEST = f'{_PTB_HOME}data/ptb.char.test.txt'


================================================
FILE: hanlp/datasets/lm/loaders/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:04


================================================
FILE: hanlp/datasets/lm/loaders/lm_dataset.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-05 21:42
import os
from typing import Union, Callable, List

import hanlp_common.io
import torch

from hanlp.common.dataset import TransformSequentialDataset
from hanlp.common.transform import ToChar, WhitespaceTokenizer, AppendEOS, FieldToIndex
from hanlp.common.vocab import Vocab
from hanlp.utils.io_util import file_cache, get_resource, TimingFileIterator


class LanguageModelDataset(TransformSequentialDataset):

    def __init__(self,
                 data: str,
                 batch_size,
                 seq_len,
                 tokenizer='char',
                 eos='\n',
                 strip=True,
                 vocab=None,
                 cache=False,
                 transform: Union[Callable, List] = None) -> None:
        """Dataset which indexes a text corpus into a binary cache and iterates it in language modeling batches.

        Args:
            data: Identifier of the corpus, resolved through ``get_resource``.
            batch_size: Number of parallel streams the token sequence is divided into.
            seq_len: Either an ``int`` or a callable returning the sequence length of the next batch.
            tokenizer: ``'char'`` or ``'whitespace'``. Values which are not strings add no tokenizer transform.
            eos: The end-of-sentence token appended through ``AppendEOS``.
            strip: Whether to strip each line and skip the ones which become empty.
            vocab: The vocabulary to index tokens with. A new :class:`Vocab` is created when it is ``None``.
            cache: Stored on the instance; its negation is passed to ``file_cache`` by :meth:`load_file`.
            transform: Transform(s) passed to the parent constructor.
        """
        self.cache = cache
        self.eos = eos
        self.strip = strip
        super().__init__(transform)
        if isinstance(tokenizer, str):
            available_tokenizers = {
                'char': ToChar('text', 'token'),
                'whitespace': WhitespaceTokenizer('text', 'token')
            }
            assert tokenizer in available_tokenizers, f'{tokenizer} not supported, available options: {available_tokenizers.keys()} '
            self.append_transform(available_tokenizers[tokenizer])

        if vocab is None:
            vocab = Vocab()
            self.training = True
        else:
            self.training = vocab.mutable
        # Order matters: the tokenizer comes first, then EOS appending, and indexing last so that
        # ``self.transform[-1]`` (see the ``vocab`` property) is the ``FieldToIndex`` holding the vocab.
        self.append_transform(AppendEOS('token', eos=eos))
        self.append_transform(FieldToIndex('token', vocab))
        self.batch_size = batch_size
        data = get_resource(data)
        self.data = data
        self.num_tokens = None
        # ``load_file`` needs ``self.data`` and the transforms above, and it sets ``self.num_tokens``.
        self.load_file(data)
        self._fp = None
        # Normalize ``seq_len`` into a callable so that callers can always invoke ``self.seq_len()``.
        if isinstance(seq_len, int):
            self.seq_len = lambda: seq_len
        else:
            self.seq_len = seq_len

    @property
    def vocab(self):
        """The vocabulary held by the last transform (the ``FieldToIndex`` appended in ``__init__``, assuming no
        transform was appended afterwards)."""
        return self.transform[-1].vocab

    @property
    def vocab_path(self):
        """Path of the vocabulary file: ``self.data`` with its extension replaced by ``.vocab.json``."""
        return os.path.splitext(self.data)[0] + '.vocab.json'

    def load_file(self, filepath):
        """Index ``filepath`` into the binary cache, or reuse a valid cache.

        The cache stores every token id as a 4-byte little-endian integer. It is rebuilt when ``file_cache``
        reports it as invalid, or when the vocabulary is still mutable but no vocabulary file exists. After
        (re)building, a mutable vocabulary is locked and saved to :attr:`vocab_path`; when a valid cache is reused,
        a mutable vocabulary is restored from :attr:`vocab_path`. ``self.num_tokens`` is set in both cases.

        Args:
            filepath: Path to the corpus.
        """
        cache, valid = file_cache(filepath, not self.cache)
        if not valid or (self.vocab.mutable and not os.path.isfile(self.vocab_path)):
            with open(cache, 'wb') as out:
                tokens, lines = 0, 0
                f = TimingFileIterator(filepath)
                for line in f:
                    if self.strip:
                        line = line.strip()
                        if not line:
                            continue
                    sample = {'text': line}
                    sample = self.transform_sample(sample, inplace=True)
                    token_ids = sample['token_id']
                    for token_id in token_ids:
                        out.write(token_id.to_bytes(4, 'little'))
                    tokens += len(token_ids)
                    lines += 1
                    f.log(f'{tokens // 1000000}M tokens, {lines // 1000000}M lines\n'
                          f'{sample["token"][:10]}')
                f.erase()
                if self.vocab.mutable:
                    self.vocab.lock()
                    # Persist the vocab itself. The previous call passed only the path to
                    # ``hanlp_common.io.save_json``, so nothing could be saved.
                    # NOTE(review): relies on ``Vocab`` providing ``save_json(path)`` - confirm.
                    self.vocab.save_json(self.vocab_path)
                self.num_tokens = tokens
        else:
            # Every token id occupies exactly 4 bytes in the cache.
            self.num_tokens = os.path.getsize(self.filecache) // 4
            if self.vocab.mutable:
                # Restore the saved vocab into the existing object. The previous call discarded the loaded
                # json, leaving the vocab empty.
                # NOTE(review): relies on ``Vocab`` providing ``load_json(path)`` - confirm.
                self.vocab.load_json(self.vocab_path)

    def __iter__(self):
        """Yield ``(data, targets)`` batches read from the binary cache.

        The cached token ids are viewed as ``batch_size`` consecutive streams of ``max_seq_len`` tokens each. Every
        batch takes the next ``seq_len`` positions of all streams; ``targets`` are the same positions shifted by one
        and flattened.
        """
        num_streams = self.batch_size
        stream_len = self.max_seq_len
        position = 0
        stop = stream_len - (2 if self.training else 1)
        with open(self.filecache, 'rb') as fp:
            while position < stop:
                # Never read past the end of a stream: one extra token is needed for the targets.
                step = min(self.seq_len(), stream_len - 1 - position)
                chunks = [self._read_chunk(fp, stream_len * stream + position, step + 1)
                          for stream in range(num_streams)]
                block = torch.LongTensor(chunks)
                block.transpose_(0, 1)
                yield block[:step, :], block[1:, :].contiguous().view(-1)
                position += step

    def estimate_num_batches(self, seq_len=None):
        if not seq_len:
            seq_len = self.seq_len()
        return self.max_seq_len // seq_len

    @property
    def max_seq_len(self):
        max_seq_len = self.num_tokens // self.batch_size
        return max_seq_len

    @staticmethod
    def _read_chunk(fp, offset, length):
        data = []
        fp.seek(offset * 4)
        for i in range(length):
            id = int.from_bytes(fp.read(4), 'little')
            data.append(id)
        return data

    def _debug_load_cache(self):
        """Load all ``num_tokens`` ids of the binary cache into a single ``torch.LongTensor`` (debugging aid)."""
        with open(self.filecache, 'rb') as src:
            ids = [int.from_bytes(src.read(4), 'little') for _ in range(self.num_tokens)]
            return torch.LongTensor(ids)

    @property
    def filecache(self):
        """Path of the binary cache for ``self.data``, i.e. the first item returned by ``file_cache``."""
        return file_cache(self.data)[0]


================================================
FILE: hanlp/datasets/lu/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:08


================================================
FILE: hanlp/datasets/lu/glue.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-10 11:47
from hanlp.common.dataset import TableDataset

# Train/dev/test splits of the Stanford Sentiment Treebank 2 (SST-2), all stored in the same zip archive.
STANFORD_SENTIMENT_TREEBANK_2_TRAIN = 'http://file.hankcs.com/corpus/SST2.zip#train.tsv'
STANFORD_SENTIMENT_TREEBANK_2_DEV = 'http://file.hankcs.com/corpus/SST2.zip#dev.tsv'
STANFORD_SENTIMENT_TREEBANK_2_TEST = 'http://file.hankcs.com/corpus/SST2.zip#test.tsv'

# Train/dev/test splits of the Microsoft Research Paraphrase Corpus (MRPC), all stored in the same zip archive.
MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_TRAIN = 'http://file.hankcs.com/corpus/mrpc.zip#train.tsv'
MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_DEV = 'http://file.hankcs.com/corpus/mrpc.zip#dev.tsv'
MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_TEST = 'http://file.hankcs.com/corpus/mrpc.zip#test.tsv'


class SST2Dataset(TableDataset):
    """A :class:`~hanlp.common.dataset.TableDataset` for SST-2 files; it adds no behavior of its own."""


def main():
    """Load the SST-2 test split and print the resulting dataset."""
    print(SST2Dataset(STANFORD_SENTIMENT_TREEBANK_2_TEST))


if __name__ == '__main__':
    main()


================================================
FILE: hanlp/datasets/ner/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-06 15:32

================================================
FILE: hanlp/datasets/ner/conll03.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-06 15:31


# All three splits are stored in the same ``conll03_en_iobes.zip`` archive.
CONLL03_EN_TRAIN = 'https://file.hankcs.com/corpus/conll03_en_iobes.zip#eng.train.tsv'
'''Training set of CoNLL03 (:cite:`tjong-kim-sang-de-meulder-2003-introduction`)'''
CONLL03_EN_DEV = 'https://file.hankcs.com/corpus/conll03_en_iobes.zip#eng.dev.tsv'
'''Dev set of CoNLL03 (:cite:`tjong-kim-sang-de-meulder-2003-introduction`)'''
CONLL03_EN_TEST = 'https://file.hankcs.com/corpus/conll03_en_iobes.zip#eng.test.tsv'
'''Test set of CoNLL03 (:cite:`tjong-kim-sang-de-meulder-2003-introduction`)'''


================================================
FILE: hanlp/datasets/ner/loaders/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:04


================================================
FILE: hanlp/datasets/ner/loaders/json_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-21 16:26
import json
import os
from typing import Union, List, Callable, Dict

from hanlp_common.constant import NULL

import hanlp.utils.span_util
from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import TimingFileIterator, read_tsv_as_sents


class JsonNERDataset(TransformableDataset):

    def __init__(self, data: Union[str, List], transform: Union[Callable, List] = None, cache=None,
                 generate_idx=None, doc_level_offset=True, tagset=None) -> None:
        """Dataset reading NER corpora stored in ``.jsonlines`` format.

        Args:
            data: Local or remote path to a dataset, or a list of samples where every sample is a dict.
            transform: Predefined transform(s).
            cache: ``True`` to cache samples so that transforms are not applied twice.
            generate_idx: Create a :const:`~hanlp_common.constant.IDX` field in each sample holding its position
                in the dataset. Useful for prediction when a sampler re-orders the samples.
            doc_level_offset: ``True`` when span offsets in the ``jsonlines`` file count tokens from the beginning
                of the document rather than from the beginning of the sentence.
            tagset: Optional tagset; entities whose label is not in it are dropped. When it is a dict, the kept
                labels are additionally replaced by the values they map to.
        """
        self.doc_level_offset = doc_level_offset
        self.tagset = tagset
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath: str):
        """Read a ``.jsonlines`` NER corpus and yield one sample per sentence.

        Every non-blank line is a JSON document with parallel ``sentences`` and ``ner`` lists. Each yielded sample
        has a ``token`` field and a ``ner`` field of ``(begin, end, label)`` tuples, in which only the first span
        for any ``(begin, end)`` pair is kept. A sample document can be inspected with the following script.

        .. highlight:: python
        .. code-block:: python

            import json
            from hanlp_common.document import Document
            from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_DEV
            from hanlp.utils.io_util import get_resource

            with open(get_resource(ONTONOTES5_CONLL12_CHINESE_DEV)) as src:
                for line in src:
                    doc = json.loads(line)
                    print(Document(doc))
                    break

        Args:
            filepath: Path to the ``.jsonlines`` NER corpus.
        """
        filename = os.path.basename(filepath)
        reader = TimingFileIterator(filepath)
        num_docs = num_sentences = 0
        for raw_line in reader:
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            doc = json.loads(raw_line)
            num_docs += 1
            # Number of tokens in the preceding sentences of this document.
            consumed = 0
            for sentence, spans in zip(doc['sentences'], doc['ner']):
                if self.doc_level_offset:
                    # Rebase document-level offsets onto the current sentence.
                    spans = [(s[0] - consumed, s[1] - consumed, s[2]) for s in spans]
                else:
                    spans = [(s[0], s[1], s[2]) for s in spans]
                tagset = self.tagset
                if tagset:
                    spans = [s for s in spans if s[2] in tagset]
                    if isinstance(tagset, dict):
                        spans = [(s[0], s[1], tagset[s[2]]) for s in spans]
                # Keep only the first label seen for each (begin, end) pair.
                seen = set()
                unique_spans = []
                for begin, end, label in spans:
                    if (begin, end) not in seen:
                        seen.add((begin, end))
                        unique_spans.append((begin, end, label))
                yield {'token': sentence, 'ner': unique_spans}
                num_sentences += 1
                consumed += len(sentence)
            reader.log(
                f'{filename} {num_docs} documents, {num_sentences} sentences [blink][yellow]...[/yellow][/blink]')
        reader.erase()


def convert_conll03_to_json(file_path):
    """Convert a CoNLL03 style ``.tsv`` file into a jsonlines file with one document per line.

    A sentence whose first token is ``-DOCSTART-`` opens a new document, provided the current one already holds
    sentences. Tags in the last column are decoded with ``iobes_tags_to_spans`` and the span offsets are shifted
    by the number of tokens preceding the sentence in its document. The output is written next to the input with
    the extension replaced by ``.json``.

    Args:
        file_path: Path to the ``.tsv`` file.
    """

    def new_doc(key):
        return {
            'doc_key': key,
            'sentences': [],
            'ner': [],
        }

    documents = []
    next_key = 0
    doc = new_doc(next_key)
    next_key += 1
    offset = 0
    for cells in read_tsv_as_sents(file_path):
        if cells[0][0] == '-DOCSTART-' and doc['ner']:
            documents.append(doc)
            doc = new_doc(next_key)
            next_key += 1
            offset = 0
        sentence = [cell[0] for cell in cells]
        tags = [cell[-1] for cell in cells]
        spans = hanlp.utils.span_util.iobes_tags_to_spans(tags)
        shifted = [[start + offset, end + offset, label] for label, (start, end) in spans]
        doc['sentences'].append(sentence)
        doc['ner'].append(shifted)
        offset += len(sentence)
    if doc['ner']:
        documents.append(doc)
    output_path = os.path.splitext(file_path)[0] + '.json'
    with open(output_path, 'w') as out:
        for document in documents:
            out.write(json.dumps(document))
            out.write('\n')


def unpack_ner(sample: dict) -> dict:
    """Split the ``ner`` spans of a sample into parallel ``begin_offset``, ``end_offset`` and ``label`` fields.

    Args:
        sample: A sample dict. It is left untouched when it has no ``ner`` field or the field is ``None``.

    Returns:
        The same dict, modified in place.
    """
    spans = sample.get('ner')
    if spans is None:
        return sample
    if spans:
        begins, ends, labels = zip(*spans)
    else:
        # A sentence without entities still needs one null label so that the fields can be padded.
        begins, ends, labels = [0], [0], [NULL]
    sample['begin_offset'] = begins
    sample['end_offset'] = ends
    sample['label'] = labels
    return sample


def prune_ner_tagset(sample: dict, tagset: Union[set, Dict[str, str]]):
    """Restrict the ``tag`` field of a sample to the entity types of ``tagset``.

    Tags of the form ``role-type`` whose type is not in ``tagset`` become ``'O'``. When ``tagset`` is a dict, the
    type of every kept tag is replaced by the value it maps to. Tags without a ``-`` are left as they are.

    Args:
        sample: A sample dict; returned unchanged when it has no ``tag`` field.
        tagset: A set of entity types to keep, or a dict mapping kept types to their new names.

    Returns:
        The same dict, with ``tag`` replaced by the pruned list.
    """
    if 'tag' not in sample:
        return sample
    rename = isinstance(tagset, dict)
    pruned = []
    for tag in sample['tag']:
        role, dash, ner_type = tag.partition('-')
        if dash:
            if ner_type not in tagset:
                tag = 'O'
            elif rename:
                tag = role + '-' + tagset[ner_type]
        pruned.append(tag)
    sample['tag'] = pruned
    return sample


================================================
FILE: hanlp/datasets/ner/loaders/tsv.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-24 23:09
from typing import Union, List, Callable

from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import get_resource, generate_words_tags_from_tsv
from hanlp.utils.string_util import split_long_sentence_into


class TSVTaggingDataset(TransformableDataset):

    def __init__(self,
                 data: Union[str, List],
                 transform: Union[Callable, List] = None,
                 cache=None,
                 generate_idx=None,
                 max_seq_len=None,
                 sent_delimiter=None,
                 char_level=False,
                 hard_constraint=False,
                 **kwargs
                 ) -> None:
        """Dataset for tagging corpora stored as ``.tsv`` files.

        Args:
            data: Local or remote path to a dataset, or a list of samples where every sample is a dict.
            transform: Predefined transform(s).
            cache: ``True`` to cache samples so that transforms are not applied twice.
            generate_idx: Create a :const:`~hanlp_common.constant.IDX` field in each sample holding its position
                in the dataset. Useful for prediction when a sampler re-orders the samples.
            max_seq_len: Sentences longer than ``max_seq_len`` are split into shorter ones if possible.
            sent_delimiter: Delimiter between sentences, like period or comma, marking positions where a long
                sentence may be split.
            char_level: Whether sequence length is measured in characters, which is never the case for
                lemmatization.
            hard_constraint: Whether to enforce the length limit strictly. A sentence without any
                ``sent_delimiter`` is then split at a token anyway.
            kwargs: Not used.
        """
        self.max_seq_len = max_seq_len
        self.sent_delimiter = sent_delimiter
        self.char_level = char_level
        self.hard_constraint = hard_constraint
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath):
        """Load a ``.tsv`` file. A ``.tsv`` file for tagging is a tab separated text file in which non-empty lines
        hold a token and its tag, and empty lines mark the end of a sentence.

        When ``max_seq_len`` is set, every sentence is passed through ``split_long_sentence_into`` and one sample
        is yielded per piece, with the tags sliced to match.

        Args:
            filepath: Path to a ``.tsv`` tagging file.

        .. highlight:: bash
        .. code-block:: bash

            $ head eng.train.tsv
            -DOCSTART-      O

            EU      S-ORG
            rejects O
            German  S-MISC
            call    O
            to      O
            boycott O
            British S-MISC
            lamb    O

        """
        filepath = get_resource(filepath)
        for words, tags in generate_words_tags_from_tsv(filepath, lower=False):
            if not self.max_seq_len:
                yield {'token': words, 'tag': tags}
                continue
            begin = 0
            pieces = split_long_sentence_into(words, self.max_seq_len, self.sent_delimiter,
                                              char_level=self.char_level,
                                              hard_constraint=self.hard_constraint)
            for piece in pieces:
                end = begin + len(piece)
                yield {'token': piece, 'tag': tags[begin:end]}
                begin = end


================================================
FILE: hanlp/datasets/ner/msra.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 23:13

# Archives holding the character-level and the token-level MSRA NER corpora respectively.
_MSRA_NER_HOME = 'http://file.hankcs.com/corpus/msra_ner.zip'
_MSRA_NER_TOKEN_LEVEL_HOME = 'http://file.hankcs.com/corpus/msra_ner_token_level.zip'

MSRA_NER_CHAR_LEVEL_TRAIN = f'{_MSRA_NER_HOME}#train.tsv'
'''Training set of MSRA (:cite:`levow-2006-third`) in character level.'''
MSRA_NER_CHAR_LEVEL_DEV = f'{_MSRA_NER_HOME}#dev.tsv'
'''Dev set of MSRA (:cite:`levow-2006-third`) in character level.'''
MSRA_NER_CHAR_LEVEL_TEST = f'{_MSRA_NER_HOME}#test.tsv'
'''Test set of MSRA (:cite:`levow-2006-third`) in character level.'''

MSRA_NER_TOKEN_LEVEL_IOBES_TRAIN = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.train.tsv'
'''Training set of MSRA (:cite:`levow-2006-third`) in token level.'''
MSRA_NER_TOKEN_LEVEL_IOBES_DEV = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.dev.tsv'
'''Dev set of MSRA (:cite:`levow-2006-third`) in token level.'''
MSRA_NER_TOKEN_LEVEL_IOBES_TEST = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.test.tsv'
'''Test set of MSRA (:cite:`levow-2006-third`) in token level.'''

# The ``short`` variants below come from the same token-level archive.
MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TRAIN = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.train.short.tsv'
'''Training set of shorten (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level.'''
MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_DEV = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.dev.short.tsv'
'''Dev set of shorten (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level.'''
MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TEST = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.test.short.tsv'
'''Test set of shorten (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level.'''

MSRA_NER_TOKEN_LEVEL_SHORT_JSON_TRAIN = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.train.short.jsonlines'
'''Training set of shorten (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level and jsonlines format.'''
MSRA_NER_TOKEN_LEVEL_SHORT_JSON_DEV = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.dev.short.jsonlines'
'''Dev set of shorten (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level and jsonlines format.'''
MSRA_NER_TOKEN_LEVEL_SHORT_JSON_TEST = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.test.short.jsonlines'
'''Test set of shorten (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level and jsonlines format.'''


================================================
FILE: hanlp/datasets/ner/resume.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-08 12:10
from hanlp.common.dataset import TransformableDataset

from hanlp.utils.io_util import get_resource, generate_words_tags_from_tsv

# The corpus files are taken from the master-branch archive of the jiesutd/LatticeLSTM GitHub repository.
_RESUME_NER_HOME = 'https://github.com/jiesutd/LatticeLSTM/archive/master.zip#'

RESUME_NER_TRAIN = _RESUME_NER_HOME + 'ResumeNER/train.char.bmes'
'''Training set of Resume in char level.'''
RESUME_NER_DEV = _RESUME_NER_HOME + 'ResumeNER/dev.char.bmes'
'''Dev set of Resume in char level.'''
RESUME_NER_TEST = _RESUME_NER_HOME + 'ResumeNER/test.char.bmes'
'''Test set of Resume in char level.'''


================================================
FILE: hanlp/datasets/ner/weibo.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-03 23:33
from hanlp.common.dataset import TransformableDataset

from hanlp.utils.io_util import get_resource, generate_words_tags_from_tsv

# The corpus files are taken from the ``data/`` folder of the master-branch archive of the hltcoe/golden-horse
# GitHub repository.
_WEIBO_NER_HOME = 'https://github.com/hltcoe/golden-horse/archive/master.zip#data/'

WEIBO_NER_TRAIN = _WEIBO_NER_HOME + 'weiboNER_2nd_conll.train'
'''Training set of Weibo in char level.'''
WEIBO_NER_DEV = _WEIBO_NER_HOME + 'weiboNER_2nd_conll.dev'
'''Dev set of Weibo in char level.'''
WEIBO_NER_TEST = _WEIBO_NER_HOME + 'weiboNER_2nd_conll.test'
'''Test set of Weibo in char level.'''


================================================
FILE: hanlp/datasets/parsing/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 00:51


================================================
FILE: hanlp/datasets/parsing/amr.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-18 17:47
from collections import defaultdict
from copy import copy
from typing import List

import numpy as np
import torch


from hanlp_common.constant import CLS
from hanlp.common.dataset import TransformableDataset, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.common.vocab import VocabWithFrequency
from hanlp.components.amr.amr_parser.amrio import AMRIO
from hanlp.components.amr.amr_parser.data import END, DUM, list_to_tensor, lists_of_string_to_tensor, NIL, REL
from hanlp.components.amr.amr_parser.transformer import SelfAttentionMask
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp_common.util import merge_list_of_dict


class AbstractMeaningRepresentationDataset(TransformableDataset):
    """Dataset of AMR samples read through ``AMRIO.read``."""

    def load_file(self, filepath: str):
        """Yield one sample dict per record produced by ``AMRIO.read``.

        Args:
            filepath: Path handed to ``AMRIO.read``.

        Yields:
            Dicts with the fields ``token``, ``lemma``, ``pos``, ``ner`` and ``amr``.
        """
        for token, lemma, pos, ner, amr in AMRIO.read(filepath):
            sample = dict(token=token, lemma=lemma, pos=pos, ner=ner, amr=amr)
            yield sample


def generate_oracle(sample: dict):
    """Store the concepts and edges of a sample's AMR graph in its ``concept`` and ``edge`` fields.

    Args:
        sample: A sample dict; returned unchanged when its ``amr`` field is missing or falsy.

    Returns:
        The same dict, modified in place.
    """
    graph = sample.get('amr')
    if not graph:
        return sample
    # Only the first two items returned by root_centered_sort() are kept.
    concepts, edges, _ = graph.root_centered_sort()
    sample['concept'] = concepts
    sample['edge'] = edges
    return sample


def chars_for_tok(sample: dict, max_string_len=20):
    """Add a ``word_char`` field holding the characters of every token.

    Each token is truncated to ``max_string_len`` characters, then wrapped as ``[CLS] + characters + [END]``.

    Args:
        sample: A sample dict with a ``token`` field.
        max_string_len: Maximum number of characters kept per token.

    Returns:
        The same dict, modified in place.
    """
    sample['word_char'] = [[CLS] + list(word[:max_string_len]) + [END] for word in sample['token']]
    return sample


def append_bos(sample: dict):
    """Prepend ``CLS`` to the ``token``, ``lemma``, ``pos`` and ``ner`` fields that are present in a sample.

    Args:
        sample: A sample dict.

    Returns:
        The same dict, with the affected fields replaced by new lists.
    """
    present = [field for field in ('token', 'lemma', 'pos', 'ner') if field in sample]
    for field in present:
        sample[field] = [CLS] + sample[field]
    return sample


def get_concepts(sample: dict, vocab: VocabWithFrequency = None, rel_vocab: VocabWithFrequency = None):
    """Build the copy candidates of a sample and a sample-local vocabulary for those unknown to ``vocab``.

    For every lemma two candidates are created: the lemma followed by ``'_'`` (``cp_seq``) and the lemma itself
    (``mp_seq``). Candidates that ``vocab`` maps to its unknown index get consecutive local indices starting at
    ``len(vocab)``. When ``rel_vocab`` is given, all of its tokens are numbered before them.

    Args:
        sample: A sample dict with ``lemma`` and ``token`` fields.
        vocab: Vocabulary used to detect unknown candidates. Despite the default, it must not be ``None``.
        rel_vocab: Optional relation vocabulary whose ``idx_to_token`` list is placed first in the local
            vocabulary.

    Returns:
        The same dict with ``cp_seq``, ``mp_seq``, ``token2idx`` and ``idx2token`` added.
    """
    # zip() stops at the shorter of the two fields, so extra lemmas beyond the tokens are ignored.
    mp_seq = [lemma for lemma, _ in zip(sample['lemma'], sample['token'])]
    cp_seq = [lemma + '_' for lemma in mp_seq]

    unk_idx_of = vocab.get_idx
    unknown = set()
    for copied, mapped in zip(cp_seq, mp_seq):
        for candidate in (copied, mapped):
            if unk_idx_of(candidate) == vocab.unk_idx:
                unknown.add(candidate)

    first_local_idx = len(vocab)
    if rel_vocab:
        local_tokens = rel_vocab.idx_to_token + sorted(unknown)
    else:
        local_tokens = sorted(unknown)
    token2idx, idx2token = {}, {}
    for idx, token in enumerate(local_tokens, start=first_local_idx):
        token2idx[token] = idx
        idx2token[idx] = token

    sample['cp_seq'] = cp_seq
    sample['mp_seq'] = mp_seq
    sample['token2idx'] = token2idx
    sample['idx2token'] = idx2token
    return sample


def batchify(data, vocabs: VocabDict, unk_rate=0., device=None, squeeze=False,
             tokenizer: TransformerSequenceTokenizer = None, shuffle_sibling=True,
             levi_graph=False, extra_arc=False, bart=False):
    """Convert a dict of collated AMR samples into the tensors consumed by the parser.

    Args:
        data: Batch dict. ``token``, ``lemma``, ``cp_seq``, ``mp_seq``, ``token2idx`` and ``idx2token`` are always
            read; ``pos``, ``ner``, ``amr`` and ``token_length`` are used when available.
        vocabs: Vocabularies. ``vocabs.rel``, ``vocabs['lemma']`` and ``vocabs['predictable_concept']`` are always
            used; ``token``, ``pos``, ``ner`` and ``word_char`` only when present. With ``amr`` in ``data``,
            ``concept`` (or ``concept_and_rel`` if present) and ``concept_char`` are used too.
        unk_rate: Forwarded to ``list_to_tensor`` for the input fields.
        device: Device the resulting tensors are moved to.
        squeeze: ``True`` to build the joint token-and-concept input with ``make_batch_for_squeeze``.
        tokenizer: Tokenizer handed to ``make_batch_for_squeeze`` and ``make_batch_for_bart``.
        shuffle_sibling: Forwarded as ``shuffle`` when the AMR graphs are sorted.
        levi_graph: ``'kahn'`` sorts graphs with ``amr.to_levi``; ``True`` converts the sorted graphs with
            ``levi_amr``. Any truthy value makes every edge label ``1`` instead of a relation index.
        extra_arc: Forwarded to ``levi_amr``.
        bart: ``True`` to additionally call ``make_batch_for_bart``.

    Returns:
        A shallow copy of ``data`` extended with the generated fields.
    """
    rel_vocab: VocabWithFrequency = vocabs.rel
    _tok = list_to_tensor(data['token'], vocabs['token'], unk_rate=unk_rate) if 'token' in vocabs else None
    _lem = list_to_tensor(data['lemma'], vocabs['lemma'], unk_rate=unk_rate)
    _pos = list_to_tensor(data['pos'], vocabs['pos'], unk_rate=unk_rate) if 'pos' in vocabs else None
    _ner = list_to_tensor(data['ner'], vocabs['ner'], unk_rate=unk_rate) if 'ner' in vocabs else None
    _word_char = lists_of_string_to_tensor(data['token'], vocabs['word_char']) if 'word_char' in vocabs else None

    # Copy candidates are indexed with the predictable-concept vocabulary plus the sample-local vocabularies.
    local_token2idx = data['token2idx']
    local_idx2token = data['idx2token']
    _cp_seq = list_to_tensor(data['cp_seq'], vocabs['predictable_concept'], local_token2idx)
    _mp_seq = list_to_tensor(data['mp_seq'], vocabs['predictable_concept'], local_token2idx)

    ret = copy(data)
    if 'amr' in data:
        # Gold graphs are available: linearize each of them into a concept list and an edge list.
        concept, edge = [], []
        for amr in data['amr']:
            if levi_graph == 'kahn':
                concept_i, edge_i = amr.to_levi(rel_vocab.get_frequency, shuffle=shuffle_sibling)
            else:
                concept_i, edge_i, _ = amr.root_centered_sort(rel_vocab.get_frequency, shuffle=shuffle_sibling)
            concept.append(concept_i)
            edge.append(edge_i)
        if levi_graph is True:
            concept_with_rel, edge_with_rel = levi_amr(concept, edge, extra_arc=extra_arc)
            concept = concept_with_rel
            edge = edge_with_rel

        augmented_concept = [[DUM] + x + [END] for x in concept]

        # Inputs drop the last position and outputs drop the first one, i.e. outputs are inputs shifted by one.
        _concept_in = list_to_tensor(augmented_concept, vocabs.get('concept_and_rel', vocabs['concept']),
                                     unk_rate=unk_rate)[:-1]
        _concept_char_in = lists_of_string_to_tensor(augmented_concept, vocabs['concept_char'])[:-1]
        _concept_out = list_to_tensor(augmented_concept, vocabs['predictable_concept'], local_token2idx)[1:]

        out_conc_len, bsz = _concept_out.shape
        _rel = np.full((1 + out_conc_len, bsz, out_conc_len), rel_vocab.pad_idx)
        # v: [<dummy>, concept_0, ..., concept_l, ..., concept_{n-1}, <end>] u: [<dummy>, concept_0, ..., concept_l, ..., concept_{n-1}]

        for bidx, (x, y) in enumerate(zip(edge, concept)):
            # First mark every pair of a concept and an earlier concept as NIL ...
            for l, _ in enumerate(y):
                if l > 0:
                    # l=1 => pos=l+1=2
                    _rel[l + 1, bidx, 1:l + 1] = rel_vocab.get_idx(NIL)
            # ... then overwrite the pairs that are connected by an edge.
            for v, u, r in x:
                if levi_graph:
                    r = 1
                else:
                    r = rel_vocab.get_idx(r)
                assert v > u, 'Invalid typological order'
                _rel[v + 1, bidx, u + 1] = r
        ret.update(
            {'concept_in': _concept_in, 'concept_char_in': _concept_char_in, 'concept_out': _concept_out, 'rel': _rel})
    else:
        # NOTE(review): the squeeze and bart helpers below iterate over augmented_concept, so they presumably
        # must not be combined with a batch that has no 'amr' field - confirm against callers.
        augmented_concept = None

    token_length = ret.get('token_length', None)
    if token_length is not None and not isinstance(token_length, torch.Tensor):
        # NOTE(review): a device that is neither a torch.device nor a number (e.g. None) would fail the ``>= 0``
        # comparison below - confirm callers always pass one of the two.
        ret['token_length'] = torch.tensor(token_length, dtype=torch.long, device=device if (
                    isinstance(device, torch.device) or device >= 0) else 'cpu:0')
    ret.update({'lem': _lem, 'tok': _tok, 'pos': _pos, 'ner': _ner, 'word_char': _word_char,
                'copy_seq': np.stack([_cp_seq, _mp_seq], -1), 'local_token2idx': local_token2idx,
                'local_idx2token': local_idx2token})
    if squeeze:
        token_field = make_batch_for_squeeze(data, augmented_concept, tokenizer, device, ret)
    else:
        token_field = 'token'
    # Pad the sub-token ids and spans of the chosen field into tensors.
    subtoken_to_tensor(token_field, ret)
    if bart:
        make_batch_for_bart(augmented_concept, ret, tokenizer, device)
    # Finally turn remaining numpy arrays into tensors and move all tensors to the device.
    move_dict_to_device(ret, device)

    return ret


def make_batch_for_bart(augmented_concept, ret, tokenizer, device, training=True):
    """Add the sub-token encodings of the concepts and a decoder attention mask to a batch.

    Args:
        augmented_concept: One concept list per sample.
        ret: The batch dict, updated in place with the tokenizer outputs for the ``concept`` field, a
            ``decoder_mask`` and, when not training, ``last_concept_offset``.
        tokenizer: A tokenizer whose underlying ``tokenizer`` attribute is re-wrapped for the ``concept`` field.
        device: Device used for the masks and for ``last_concept_offset``.
        training: When ``True`` the last concept of every sample is left out of the encoding.
    """
    token_field = 'concept'
    tokenizer = TransformerSequenceTokenizer(tokenizer.tokenizer, token_field, cls_is_bos=True, sep_is_eos=None)
    encodings = [tokenizer({token_field: x[:-1] if training else x}) for x in augmented_concept]
    ret.update(merge_list_of_dict(encodings))
    decoder_mask = []
    # All masks share the size of the longest sub-token sequence so that they can be stacked.
    max_seq_len = len(max(ret['concept_input_ids'], key=len))
    last_concept_offset = []
    for spans, concepts in zip(ret['concept_token_span'], augmented_concept):
        mask = ~SelfAttentionMask.get_mask(max_seq_len, device, ret_parameter=False)
        # Additionally set the mask from each sub-token to the later sub-tokens of the same concept.
        for group in spans:
            for i in range(len(group)):
                for j in range(i + 1, len(group)):
                    mask[group[i], group[j]] = True
        decoder_mask.append(mask)
        last_concept_offset.append(len(concepts) - 1)
    ret['decoder_mask'] = torch.stack(decoder_mask)
    if not training:
        ret['last_concept_offset'] = torch.tensor(last_concept_offset, device=device, dtype=torch.long)
    subtoken_to_tensor(token_field, ret)


def levi_amr(concept, edge, extra_arc=False):
    """Convert a batch of graphs into Levi form, where relations become nodes of their own.

    Args:
        concept: One concept list per graph.
        edge: One edge list per graph, parallel to ``concept``.
        extra_arc: Forwarded to ``linearize``.

    Returns:
        A tuple of the concept lists including relation nodes and the matching edge lists. In every edge the
        larger node index comes first.
    """
    concept_with_rel, edge_with_rel = [], []
    for nodes, arcs in zip(concept, edge):
        nodes, arcs = linearize(nodes, arcs, NIL, prefix=REL, extra_arc=extra_arc)
        # The graph is treated as undirected, so every arc can safely be stored with its larger endpoint first.
        arcs = [tuple(sorted(arc[:2], reverse=True)) + arc[2:] for arc in arcs]
        concept_with_rel.append(nodes)
        edge_with_rel.append(arcs)
    return concept_with_rel, edge_with_rel


def move_dict_to_device(ret, device):
    """Turn the numpy arrays of a dict into tensors and move all of its tensors to a device, in place.

    Values that are neither ``np.ndarray`` nor ``torch.Tensor`` are left untouched.

    Args:
        ret: The dict to update.
        device: Target device; ``-1`` stands for ``'cpu:0'``.
    """
    target = 'cpu:0' if device == -1 else device
    for key in list(ret):
        value = ret[key]
        if isinstance(value, np.ndarray):
            ret[key] = torch.tensor(value, device=target).contiguous()
        elif isinstance(value, torch.Tensor):
            ret[key] = value.to(target).contiguous()


def subtoken_to_tensor(token_field, ret):
    """Pad the ``{token_field}_input_ids`` and ``{token_field}_token_span`` entries of a batch, in place.

    Args:
        token_field: Prefix of the two entries to pad.
        ret: The batch dict; both entries are replaced by the output of ``PadSequenceDataLoader.pad_data``
            called with padding value ``0`` and dtype ``torch.long``.
    """
    ids_key = f'{token_field}_input_ids'
    span_key = f'{token_field}_token_span'
    padded_ids = PadSequenceDataLoader.pad_data(ret[ids_key], 0, torch.long)
    padded_spans = PadSequenceDataLoader.pad_data(ret[span_key], 0, torch.long)
    ret.update({span_key: padded_spans, ids_key: padded_ids})


def make_batch_for_squeeze(data, augmented_concept, tokenizer, device, ret):
    """Encode tokens and concepts of every sample as one sequence and build the matching masks.

    Each sample becomes ``tokens + [sep_token] + concepts``. The batch dict receives the tokenizer outputs for the
    ``token_and_concept`` field plus ``attention_mask``, ``concept_mask``, ``token_mask``, ``token_type_ids``,
    ``snt_len`` and ``last_concept_offset``.

    Args:
        data: The input batch; its ``token`` field is read.
        augmented_concept: One concept list per sample.
        tokenizer: Tokenizer providing ``sep_token`` and an underlying ``tokenizer.sep_token_id``.
        device: Device passed to ``SelfAttentionMask.get_mask``.
        ret: The batch dict to update in place.

    Returns:
        The name of the field that was encoded, ``'token_and_concept'``.
    """
    token_field = 'token_and_concept'
    attention_mask = []
    token_and_concept = [t + [tokenizer.sep_token] + c for t, c in zip(data['token'], augmented_concept)]
    encodings = [tokenizer({token_field: x}) for x in token_and_concept]
    ret.update(merge_list_of_dict(encodings))
    max_input_len = len(max(ret[f'{token_field}_input_ids'], key=len))
    concept_mask = []
    token_mask = []
    token_type_ids = []
    snt_len = []
    last_concept_offset = []
    for tokens, concepts, input_ids, spans in zip(data['token'], augmented_concept,
                                                  ret['token_and_concept_input_ids'],
                                                  ret['token_and_concept_token_span']):
        # Lengths counted in tokens/concepts (as opposed to the sub-token lengths computed further below).
        raw_sent_len = len(tokens) + 1  # for [SEP]
        raw_concept_len = len(concepts)
        if concepts[-1] == END:
            concept_mask.append([False] * raw_sent_len + [True] * (raw_concept_len - 1) + [False])  # skip END concept
        else:
            concept_mask.append([False] * raw_sent_len + [True] * raw_concept_len)
        # Select the tokens between the first position and the separator.
        token_mask.append([False] + [True] * (raw_sent_len - 2) + [False] * (raw_concept_len + 1))
        # NOTE(review): this compares how many samples have been processed so far, not the per-sample mask
        # lengths, so it always holds.
        assert len(concept_mask) == len(token_mask)
        snt_len.append(raw_sent_len - 2)  # skip [CLS] and [SEP]
        # Sub-token lengths: everything up to and including the first separator id belongs to the sentence.
        sent_len = input_ids.index(tokenizer.tokenizer.sep_token_id) + 1
        concept_len = len(input_ids) - sent_len
        mask = torch.zeros((max_input_len, max_input_len), dtype=torch.bool)
        # Every position is marked against the sentence part ...
        mask[:sent_len + concept_len, :sent_len] = True
        # ... while the concept-to-concept block is filled from the negated self-attention mask.
        bottom_right = ~SelfAttentionMask.get_mask(concept_len, device, ret_parameter=False)
        mask[sent_len:sent_len + concept_len, sent_len:sent_len + concept_len] = bottom_right
        # Additionally mark each concept sub-token against the later sub-tokens of the same concept.
        for group in spans:
            if group[0] >= sent_len:
                for i in range(len(group)):
                    for j in range(i + 1, len(group)):
                        mask[group[i], group[j]] = True
        attention_mask.append(mask)
        _token_type_ids = [0] * sent_len + [1] * concept_len
        token_type_ids.append(_token_type_ids)
        assert len(input_ids) == len(_token_type_ids)
        last_concept_offset.append(raw_concept_len - 1)
    ret['attention_mask'] = torch.stack(attention_mask)
    ret['concept_mask'] = PadSequenceDataLoader.pad_data(concept_mask, 0, torch.bool)
    ret['token_mask'] = PadSequenceDataLoader.pad_data(token_mask, 0, torch.bool)
    ret['token_type_ids'] = PadSequenceDataLoader.pad_data(token_type_ids, 0, torch.long)
    ret['snt_len'] = PadSequenceDataLoader.pad_data(snt_len, 0, torch.long)
    ret['last_concept_offset'] = PadSequenceDataLoader.pad_data(last_concept_offset, 0, torch.long)
    return token_field


def linearize(concept: List, edge: List, label='', prefix=REL, extra_arc=False):
    """Insert a relation node after each concept for every edge that leaves it towards an earlier concept.

    Args:
        concept: The concepts of a graph.
        edge: ``(v, u, r)`` triples connecting concept ``v`` to concept ``u`` with relation ``r``.
        label: Label given to every generated edge.
        prefix: Prefix prepended to a relation to form the name of its node.
        extra_arc: ``True`` to also keep a direct edge between the two concepts.

    Returns:
        A tuple of the concept list including relation nodes, and the edges between the new node positions.
        An original edge ``(v, u, r)`` becomes ``(v', m, label)`` and ``(m, u', label)`` where ``m`` is the
        relation node.
    """
    relations = defaultdict(dict)
    for head, tail, rel in edge:
        relations[head][tail] = rel

    # First pass: lay out the nodes and remember where every original concept ends up.
    nodes = []
    position = {}
    for idx, name in enumerate(concept):
        position[idx] = len(nodes)
        nodes.append(name)
        nodes.extend(prefix + rel for tail, rel in relations[idx].items() if tail < idx)
    for old, new in position.items():
        assert concept[old] == nodes[new]

    # Second pass: connect concepts through their relation nodes.
    arcs = []
    for idx, _ in enumerate(concept):
        # NOTE(review): ``rank`` counts all relations recorded for this concept, so the relation node index
        # below matches the first pass only if all of them point to earlier concepts - confirm with callers.
        for rank, (tail, rel) in enumerate(relations[idx].items()):
            if tail >= idx:
                continue
            source = position[idx]
            target = position[tail]
            middle = source + rank + 1
            arcs.append((source, middle, label))
            arcs.append((middle, target, label))
            if extra_arc:
                arcs.append((source, target, label))
    return nodes, arcs


def unlinearize(concept: List, edge: List, prefix=REL, extra_arc=False):
    """Recover the concepts and labeled edges of a graph from its linearized form.

    Edges are consumed in consecutive pairs ``(concept, relation)`` and ``(relation, concept)``. A pair is skipped
    when either outer end is not a concept.

    Args:
        concept: Concepts mixed with relation nodes, which are recognized by ``prefix``.
        edge: Edges between positions of ``concept``.
        prefix: Prefix marking relation nodes. It is used consistently for splitting concepts from relations,
            for filtering extra arcs and for stripping the relation names.
        extra_arc: ``True`` when ``edge`` also contains direct concept-to-concept arcs, which are dropped first.

    Returns:
        A tuple of the real concepts and the ``(v, u, r)`` edges between their positions.
    """
    real_concept, reorder = separate_concept_rel(concept, prefix)
    if extra_arc:
        # Keep only arcs that touch a relation node. This previously tested against the module constant REL,
        # ignoring a custom ``prefix``.
        edge = [x for x in edge if concept[x[0]].startswith(prefix) or concept[x[1]].startswith(prefix)]
    real_edge = []
    for f, b in zip(edge[::2], edge[1::2]):
        if b[1] not in reorder:
            continue
        u = reorder[b[1]]
        if f[0] not in reorder:
            continue
        v = reorder[f[0]]
        r = concept[f[1]][len(prefix):]
        real_edge.append((v, u, r))
    return real_concept, real_edge


def separate_concept_rel(concept, prefix=REL):
    """Filter relation nodes out of a linearized concept list.

    Args:
        concept: Concepts mixed with relation nodes.
        prefix: Prefix marking relation nodes.

    Returns:
        A tuple of the remaining concepts and a dict mapping their old positions to their new ones.
    """
    real_concept = []
    reorder = {}
    for position, name in enumerate(concept):
        if name.startswith(prefix):
            continue
        reorder[position] = len(real_concept)
        real_concept.append(name)
    return real_concept, reorder


def remove_unconnected_components(concept: List, edge: List):
    """Reduce a graph to its largest connected component.

    Args:
        concept: The nodes of the graph.
        edge: ``(v, u, r)`` triples where ``v`` and ``u`` are positions in ``concept``.

    Returns:
        ``concept`` and ``edge`` themselves when the graph has a single component. Otherwise a tuple of the nodes
        of the largest component and the edges among them, re-indexed to the new node positions. Among
        components of equal size, the one with the highest component label wins.
    """
    # Import from the public scipy namespace; ``csgraph._traversal`` is a private module.
    from scipy.sparse import csr_matrix
    from scipy.sparse.csgraph import connected_components
    # ``np.int`` was a plain alias of the builtin ``int`` and no longer exists in recent NumPy releases.
    row = np.array([x[0] for x in edge], dtype=int)
    col = np.array([x[1] for x in edge], dtype=int)
    data = np.ones(len(row), dtype=int)
    graph = csr_matrix((data, (row, col)), shape=(len(concept), len(concept)))
    n_components, labels = connected_components(csgraph=graph, directed=True, return_labels=True)
    if n_components <= 1:
        return concept, edge

    unique, counts = np.unique(labels, return_counts=True)
    largest_component = max(zip(counts, unique))[-1]
    connected_nodes = set(np.where(labels == largest_component)[0].tolist())
    # Map the old positions of the surviving nodes to their positions in the reduced graph.
    reorder = dict()
    good_concept = []
    for i, c in enumerate(concept):
        if i in connected_nodes:
            reorder[i] = len(good_concept)
            good_concept.append(c)
    good_edge = [(reorder[v], reorder[u], r) for v, u, r in edge
                 if v in connected_nodes and u in connected_nodes]
    return good_concept, good_edge


def largest_connected_component(triples: List):
    """Index the nodes of a list of triples and keep only the largest connected component.

    Args:
        triples: ``(u, r, v)`` triples of a source node, a relation and a target node.

    Returns:
        A tuple of the concepts, numbered in order of first appearance, and ``(u, v, r)`` edges between their
        positions, after ``remove_unconnected_components`` has been applied.
    """
    node_to_id = {}
    concept, edge = [], []
    for source, rel, target in triples:
        for node in (source, target):
            if node not in node_to_id:
                node_to_id[node] = len(node_to_id)
                concept.append(node)
        edge.append((node_to_id[source], node_to_id[target], rel))
    concept, edge = remove_unconnected_components(concept, edge)
    return concept, edge


def to_triples(concept: List, edge: List):
    """Turn ``(u, v, r)`` edges over concept positions into ``(concept_u, r, concept_v)`` triples.

    Args:
        concept: The concepts of a graph.
        edge: Edges whose first two items are positions in ``concept``.

    Returns:
        A list with one triple per edge.
    """
    triples = []
    for source, target, rel in edge:
        triples.append((concept[source], rel, concept[target]))
    return triples


def reverse_edge_for_levi_bfs(concept, edge):
    """Append ``'_reverse_'`` to the relation nodes touched by edges labeled ``'_reverse_'``, in place.

    Args:
        concept: Concepts mixed with relation nodes (those starting with ``REL``); modified in place.
        edge: ``(v, u, r)`` triples over positions of ``concept``.
    """
    marker = '_reverse_'
    for v, u, r in edge:
        if r != marker:
            continue
        for node in (v, u):
            name = concept[node]
            # Only relation nodes are marked, and each of them at most once.
            if name.startswith(REL) and not name.endswith(marker):
                concept[node] = name + marker


def un_kahn(concept, edge):
    """Collapse relation nodes back into labelled edges between real concepts.

    Example input:
    (['want', 'rel=ARG1', 'rel=ARG0', 'believe', 'rel=ARG1', 'rel=ARG0', 'boy', 'girl'],
    [(0, 1, 0.9999417066574097), (0, 2, 0.9999995231628418), (1, 3, 0.9999992847442627), (3, 4, 1.0), (3, 5, 0.9999996423721313), (2, 6, 0.9996106624603271), (4, 6, 0.9999767541885376), (5, 7, 0.9999860525131226)])

    Every edge ``a -> b`` leaving a node that is not a relation is paired with each later edge
    ``b -> d``; for each ``(a, d)`` the intermediate node with the largest probability product wins.

    Args:
        concept: Concept strings where relation nodes start with ``REL``.
        edge: ``(from, to, probability)`` triples.

    Returns:
        ``(real_concept, real_edge)`` where ``real_concept`` comes from ``separate_concept_rel`` and each
        edge is ``(reorder[d], reorder[a], relation)`` with the ``REL`` prefix stripped from the relation.
    """
    real_concept, reorder = separate_concept_rel(concept)
    best = dict()
    for pos, (a, b, p1) in enumerate(edge):
        if concept[a].startswith(REL):
            continue
        # Only edges listed after the current one can continue the two-hop path.
        for c, d, p2 in edge[pos + 1:]:
            if b != c:
                continue
            score = p1 * p2
            if score > best.get((a, d), (None, 0))[1]:
                best[(a, d)] = (b, score)
    real_edge = []
    for (a, d), (rel_idx, _) in best.items():
        u = reorder[a]
        r = concept[rel_idx][len(REL):]
        v = reorder[d]
        real_edge.append((v, u, r))
    return real_concept, real_edge


================================================
FILE: hanlp/datasets/parsing/ctb5.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 18:44
from hanlp_common.constant import HANLP_URL

# Archive holding the CTB dependency data and embeddings used below.
_CTB_HOME = f'{HANLP_URL}embeddings/SUDA-LA-CIP_20200109_021624.zip#'

_CTB5_DEP_HOME = f'{_CTB_HOME}BPNN/data/ctb5/'

CTB5_DEP_TRAIN = f'{_CTB5_DEP_HOME}train.conll'
'''Training set for ctb5 dependency parsing.'''
CTB5_DEP_DEV = f'{_CTB5_DEP_HOME}dev.conll'
'''Dev set for ctb5 dependency parsing.'''
CTB5_DEP_TEST = f'{_CTB5_DEP_HOME}test.conll'
'''Test set for ctb5 dependency parsing.'''

# Embedding text file located in the same archive.
CIP_W2V_100_CN = f'{_CTB_HOME}BPNN/data/embed.txt'


================================================
FILE: hanlp/datasets/parsing/ctb7.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 18:44
from hanlp.datasets.parsing.ctb5 import _CTB_HOME

# CTB7 lives in the same archive as CTB5, under its own sub-directory.
_CTB7_HOME = f'{_CTB_HOME}BPNN/data/ctb7/'

CTB7_DEP_TRAIN = f'{_CTB7_HOME}train.conll'
'''Training set for ctb7 dependency parsing.'''
CTB7_DEP_DEV = f'{_CTB7_HOME}dev.conll'
'''Dev set for ctb7 dependency parsing.'''
CTB7_DEP_TEST = f'{_CTB7_HOME}test.conll'
'''Test set for ctb7 dependency parsing.'''


================================================
FILE: hanlp/datasets/parsing/ctb8.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-14 20:54

from hanlp.datasets.parsing.loaders._ctb_utils import make_ctb

_CTB8_HOME = 'https://wakespace.lib.wfu.edu/bitstream/handle/10339/39379/LDC2013T21.tgz#data/'

# Chinese word segmentation.
CTB8_CWS_TRAIN = f'{_CTB8_HOME}tasks/cws/train.txt'
'''Training set for ctb8 Chinese word segmentation.'''
CTB8_CWS_DEV = f'{_CTB8_HOME}tasks/cws/dev.txt'
'''Dev set for ctb8 Chinese word segmentation.'''
CTB8_CWS_TEST = f'{_CTB8_HOME}tasks/cws/test.txt'
'''Test set for ctb8 Chinese word segmentation.'''

# Part-of-speech tagging.
CTB8_POS_TRAIN = f'{_CTB8_HOME}tasks/pos/train.tsv'
'''Training set for ctb8 PoS tagging.'''
CTB8_POS_DEV = f'{_CTB8_HOME}tasks/pos/dev.tsv'
'''Dev set for ctb8 PoS tagging.'''
CTB8_POS_TEST = f'{_CTB8_HOME}tasks/pos/test.tsv'
'''Test set for ctb8 PoS tagging.'''

# Constituency parsing, empty categories kept.
CTB8_BRACKET_LINE_TRAIN = f'{_CTB8_HOME}tasks/par/train.txt'
'''Training set for ctb8 constituency parsing with empty categories.'''
CTB8_BRACKET_LINE_DEV = f'{_CTB8_HOME}tasks/par/dev.txt'
'''Dev set for ctb8 constituency parsing with empty categories.'''
CTB8_BRACKET_LINE_TEST = f'{_CTB8_HOME}tasks/par/test.txt'
'''Test set for ctb8 constituency parsing with empty categories.'''

# Constituency parsing, empty categories removed.
CTB8_BRACKET_LINE_NOEC_TRAIN = f'{_CTB8_HOME}tasks/par/train.noempty.txt'
'''Training set for ctb8 constituency parsing without empty categories.'''
CTB8_BRACKET_LINE_NOEC_DEV = f'{_CTB8_HOME}tasks/par/dev.noempty.txt'
'''Dev set for ctb8 constituency parsing without empty categories.'''
CTB8_BRACKET_LINE_NOEC_TEST = f'{_CTB8_HOME}tasks/par/test.noempty.txt'
'''Test set for ctb8 constituency parsing without empty categories.'''

# Dependency parsing.
CTB8_SD330_TRAIN = f'{_CTB8_HOME}tasks/dep/train.conllx'
'''Training set for ctb8 in Stanford Dependencies 3.3.0 standard.'''
CTB8_SD330_DEV = f'{_CTB8_HOME}tasks/dep/dev.conllx'
'''Dev set for ctb8 in Stanford Dependencies 3.3.0 standard.'''
CTB8_SD330_TEST = f'{_CTB8_HOME}tasks/dep/test.conllx'
'''Test set for ctb8 in Stanford Dependencies 3.3.0 standard.'''

# Import-time side effect: make_ctb fetches the CTB8 resource and builds its ``cleaned_bracket``
# and ``tasks`` directories when they do not exist yet.
make_ctb(_CTB8_HOME)


================================================
FILE: hanlp/datasets/parsing/ctb9.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-14 20:54
from urllib.error import HTTPError

from hanlp.datasets.parsing.loaders._ctb_utils import make_ctb
from hanlp.utils.io_util import get_resource, path_from_url

_CTB9_HOME = 'https://catalog.ldc.upenn.edu/LDC2016T13/ctb9.0_LDC2016T13.tgz#data/'

# Chinese word segmentation.
CTB9_CWS_TRAIN = f'{_CTB9_HOME}tasks/cws/train.txt'
'''Training set for ctb9 Chinese word segmentation.'''
CTB9_CWS_DEV = f'{_CTB9_HOME}tasks/cws/dev.txt'
'''Dev set for ctb9 Chinese word segmentation.'''
CTB9_CWS_TEST = f'{_CTB9_HOME}tasks/cws/test.txt'
'''Test set for ctb9 Chinese word segmentation.'''

# Part-of-speech tagging.
CTB9_POS_TRAIN = f'{_CTB9_HOME}tasks/pos/train.tsv'
'''Training set for ctb9 PoS tagging.'''
CTB9_POS_DEV = f'{_CTB9_HOME}tasks/pos/dev.tsv'
'''Dev set for ctb9 PoS tagging.'''
CTB9_POS_TEST = f'{_CTB9_HOME}tasks/pos/test.tsv'
'''Test set for ctb9 PoS tagging.'''

# Constituency parsing, empty categories kept.
CTB9_BRACKET_LINE_TRAIN = f'{_CTB9_HOME}tasks/par/train.txt'
'''Training set for ctb9 constituency parsing with empty categories.'''
CTB9_BRACKET_LINE_DEV = f'{_CTB9_HOME}tasks/par/dev.txt'
'''Dev set for ctb9 constituency parsing with empty categories.'''
CTB9_BRACKET_LINE_TEST = f'{_CTB9_HOME}tasks/par/test.txt'
'''Test set for ctb9 constituency parsing with empty categories.'''

# Constituency parsing, empty categories removed.
CTB9_BRACKET_LINE_NOEC_TRAIN = f'{_CTB9_HOME}tasks/par/train.noempty.txt'
'''Training set for ctb9 constituency parsing without empty categories.'''
CTB9_BRACKET_LINE_NOEC_DEV = f'{_CTB9_HOME}tasks/par/dev.noempty.txt'
'''Dev set for ctb9 constituency parsing without empty categories.'''
CTB9_BRACKET_LINE_NOEC_TEST = f'{_CTB9_HOME}tasks/par/test.noempty.txt'
'''Test set for ctb9 constituency parsing without empty categories.'''

# Dependency parsing.
CTB9_SD330_TRAIN = f'{_CTB9_HOME}tasks/dep/train.conllx'
'''Training set for ctb9 in Stanford Dependencies 3.3.0 standard.'''
CTB9_SD330_DEV = f'{_CTB9_HOME}tasks/dep/dev.conllx'
'''Dev set for ctb9 in Stanford Dependencies 3.3.0 standard.'''
CTB9_SD330_TEST = f'{_CTB9_HOME}tasks/dep/test.conllx'
'''Test set for ctb9 in Stanford Dependencies 3.3.0 standard.'''

# Resolve the archive first: an HTTP error is reported as a missing file with
# instructions on where to place a manually obtained copy.
try:
    get_resource(_CTB9_HOME)
except HTTPError:
    licence_hint = (
        'Chinese Treebank 9.0 is a copyright dataset owned by LDC which we cannot re-distribute. '
        f'Please apply for a licence from LDC (https://catalog.ldc.upenn.edu/LDC2016T13) '
        f'then download it to {path_from_url(_CTB9_HOME)}'
    )
    raise FileNotFoundError(licence_hint)

# Import-time side effect: build the cleaned trees and task files if they are missing.
make_ctb(_CTB9_HOME)


================================================
FILE: hanlp/datasets/parsing/loaders/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:04


================================================
FILE: hanlp/datasets/parsing/loaders/_ctb_utils.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-25 16:14
import os
import shutil
import sys
from collections import defaultdict
from os import listdir
from os.path import join, isfile
from typing import List

from phrasetree.tree import Tree

from hanlp.components.parsers.conll import read_conll
from hanlp.utils.io_util import get_resource, get_exitcode_stdout_stderr, read_tsv_as_sents, run_cmd, pushd
from hanlp.utils.log_util import cprint
from hanlp.utils.time_util import CountdownTimer


# See Shao et al., 2017
# CTB9_ACADEMIA_SPLITS = {
#     'train': '''
# 0044-0143, 0170-0270, 0400-0899,
# 1001-1017, 1019, 1021-1035, 1037-
# 1043, 1045-1059, 1062-1071, 1073-
# 1117, 1120-1131, 1133-1140, 1143-
# 1147, 1149-1151, 2000-2915, 4051-
# 4099, 4112-4180, 4198-4368, 5000-
# 5446, 6000-6560, 7000-7013
#     ''',
#     'dev': '''
# 0301-0326, 2916-3030, 4100-4106,
# 4181-4189, 4369-4390, 5447-5492,
# 6561-6630, 7013-7014
#     ''',
#     'test': '''
# 0001-0043, 0144-0169, 0271-0301,
# 0900-0931, 1018, 1020, 1036, 1044,
# 1060, 1061, 1072, 1118, 1119, 1132,
# 1141, 1142, 1148, 3031-3145, 4107-
# 4111, 4190-4197, 4391-4411, 5493-
# 5558, 6631-6700, 7015-7017
#     '''
# }
#
#
# def _make_splits(splits: Dict[str, str]):
#     total = set()
#     for part, text in list(splits.items()):
#         if not isinstance(text, str):
#             continue
#         lines = text.replace('\n', '').split()
#         cids = set()
#         for line in lines:
#             for each in line.split(','):
#                 each = each.strip()
#                 if not each:
#                     continue
#                 if '-' in each:
#                     start, end = each.split('-')
#                     start, end = map(lambda x: int(x), [start, end])
#                     cids.update(range(start, end + 1))
#                     # cids.update(map(lambda x: f'{x:04d}', range(start, end)))
#                 else:
#                     cids.add(int(each))
#         cids = set(f'{x:04d}' for x in cids)
#         assert len(cids & total) == 0, f'Overlap found in {part}'
#         splits[part] = cids
#
#     return splits
#
#
# _make_splits(CTB9_ACADEMIA_SPLITS)


def convert_to_dependency(src, dst, language='zh', version='3.3.0', conllx=True, ud=False):
    """Convert a file of bracketed trees into dependencies by running Stanford Parser through ``java``.

    Args:
        src: Path of the tree file passed to the converter as ``-treeFile``.
        dst: Path the converter's standard output is written to.
        language: ``'zh'`` selects the Chinese converter classes, anything else the English/UD ones.
        version: Stanford Parser release to use, either ``'3.3.0'`` or ``'4.2.0'``.
        conllx: Whether to pass ``-conllx`` to the converter.
        ud: Use the universal dependencies converter classes. When ``False``, ``-basic -keepPunct``
            is passed as well.

    Raises:
        ValueError: If ``version`` is not supported.
        RuntimeError: If the java process exits with a non-zero code.
    """
    cprint(f'Converting {os.path.basename(src)} to {os.path.basename(dst)} using Stanford Parser Version {version}. '
           f'It might take a while [blink][yellow]...[/yellow][/blink]')
    if version == '3.3.0':
        sp_home = 'https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip'
    elif version == '4.2.0':
        sp_home = 'https://nlp.stanford.edu/software/stanford-parser-4.2.0.zip'
    else:
        raise ValueError(f'Unsupported version {version}')
    sp_home = get_resource(sp_home)
    # jar_path = get_resource(f'{sp_home}#stanford-parser.jar')
    if ud:
        jclass = 'edu.stanford.nlp.trees.international.pennchinese.UniversalChineseGrammaticalStructure' if language == 'zh' \
            else 'edu.stanford.nlp.trees.ud.UniversalDependenciesConverter'
    else:
        jclass = 'edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure' if language == 'zh' \
            else 'edu.stanford.nlp.trees.EnglishGrammaticalStructure'
    cmd = f'java -cp {sp_home}/* {jclass} ' \
          f'-treeFile {src}'
    if conllx:
        cmd += ' -conllx'
    if not ud:
        cmd += ' -basic -keepPunct'
    code, out, err = get_exitcode_stdout_stderr(cmd)
    # Pin the codec like the other writers in this module do; the locale default
    # cannot be relied on to encode treebank text on every platform.
    with open(dst, 'w', encoding='utf-8') as f:
        f.write(out)
    # The output is written before the exit code is checked, so partial output is kept on failure.
    if code:
        raise RuntimeError(f'Conversion failed with code {code} for {src}. The err message is:\n {err}\n'
                           f'Do you have java installed? Do you have enough memory?')


def clean_ctb_bracketed(ctb_root, out_root):
    """Copy the bracketed files of a CTB release while dropping markup lines.

    Every ``chtb*`` file under ``<ctb_root>/bracketed`` is written to ``<out_root>/<name>.txt``,
    omitting lines whose stripped content starts with ``<``.

    Args:
        ctb_root: Root of the treebank containing the ``bracketed`` folder.
        out_root: Output folder, created if necessary.
    """
    os.makedirs(out_root, exist_ok=True)
    bracketed_root = join(ctb_root, 'bracketed')
    filenames = _list_treebank_root(bracketed_root)
    timer = CountdownTimer(len(filenames))
    for filename in filenames:
        src_path = join(bracketed_root, filename)
        dst_path = join(out_root, filename + '.txt')
        with open(src_path, encoding='utf-8') as src, open(dst_path, 'w', encoding='utf-8') as out:
            out.writelines(line for line in src if not line.strip().startswith('<'))
        timer.log('Cleaning up CTB [blink][yellow]...[/yellow][/blink]', erase=False)


def _list_treebank_root(ctb_root):
    chtbs = [f for f in listdir(ctb_root) if isfile(join(ctb_root, f)) and f.startswith('chtb')]
    return sorted(chtbs)


def list_treebank(ctb_home):
    """List the cleaned treebank files of a CTB release.

    Args:
        ctb_home: Identifier of the treebank, resolved through ``get_resource``.

    Returns:
        Sorted ``chtb*`` file names found in the ``cleaned_bracket`` folder.
    """
    resolved_home = get_resource(ctb_home)
    return _list_treebank_root(join(resolved_home, 'cleaned_bracket'))


def load_bracketed_trees(chtbs) -> List[Tree]:
    """Parse all bracketed trees stored in the given files.

    Args:
        chtbs: Paths of UTF-8 files in which trees are separated by blank lines.

    Returns:
        The parsed trees of all files, in file order.
    """
    trees = []
    for f in chtbs:
        with open(f, encoding='utf-8') as src:
            content = src.read()
        # Keep the raw chunks under their own name: reusing ``trees`` here would discard
        # the trees of previous files and append to the list while it is being iterated.
        for bracketed in content.split('\n\n'):
            if bracketed.strip():
                trees.append(Tree.fromstring(bracketed))
    return trees


def split_str_to_trees(text: str):
    """Split text into tree strings, starting a new tree at every line that begins with ``(``.

    Blank lines are ignored. Lines that do not begin with ``(`` are joined to the current tree
    with a newline, and each resulting tree string is stripped.

    Args:
        text: Raw content holding one or more bracketed trees.

    Returns:
        A list of tree strings.
    """
    groups = []
    for raw in text.split('\n'):
        if not raw.strip():
            continue
        if raw.startswith('(') or not groups:
            groups.append([raw])
        else:
            groups[-1].append(raw)
    return ['\n'.join(group).strip() for group in groups]


def make_ctb_tasks(chtbs, out_root, part):
    """Generate the cws, pos, par and dep files of one portion of CTB.

    Args:
        chtbs: Paths of cleaned bracketed files that belong to this portion.
        out_root: Folder under which the ``cws``, ``pos``, ``par`` and ``dep`` sub-folders are created.
        part: Name of the portion, used as the stem of every generated file.

    Raises:
        ValueError: If a tree cannot be parsed.
    """
    for task in ['cws', 'pos', 'par', 'dep']:
        os.makedirs(join(out_root, task), exist_ok=True)
    timer = CountdownTimer(len(chtbs))
    par_path = join(out_root, 'par', f'{part}.txt')
    with open(join(out_root, 'cws', f'{part}.txt'), 'w', encoding='utf-8') as cws, \
            open(join(out_root, 'pos', f'{part}.tsv'), 'w', encoding='utf-8') as pos, \
            open(par_path, 'w', encoding='utf-8') as par:
        for f in chtbs:
            with open(f, encoding='utf-8') as src:
                content = src.read()
                trees = split_str_to_trees(content)
                for tree in trees:
                    try:
                        tree = Tree.fromstring(tree)
                    except ValueError as e:
                        # Raise instead of calling exit(): SystemExit would bypass callers that
                        # clean up on ``Exception`` and would terminate the whole interpreter.
                        raise ValueError(f'Failed to parse a bracketed tree in {f}:\n{tree}') from e
                    words = []
                    for word, tag in tree.pos():
                        # Skip empty categories and untagged leaves.
                        if tag == '-NONE-' or not tag:
                            continue
                        tag = tag.split('-')[0]
                        if tag == 'X':  # 铜_NN 30_CD ｘ_X 25_CD ｘ_X 14_CD cm_NT 1999_NT
                            tag = 'FW'
                        pos.write('{}\t{}\n'.format(word, tag))
                        words.append(word)
                    cws.write(' '.join(words))
                    par.write(tree.pformat(margin=sys.maxsize))
                    # Terminate the sentence in all three files.
                    for fp in cws, pos, par:
                        fp.write('\n')
            timer.log(f'Preprocesing the [blue]{part}[/blue] set of CTB [blink][yellow]...[/yellow][/blink]',
                      erase=False)
    remove_all_ec(par_path)
    dep_path = join(out_root, 'dep', f'{part}.conllx')
    convert_to_dependency(par_path, dep_path)
    sents = list(read_conll(dep_path))
    # Rewrite the converted file with normalized tags; pin the codec like the writers above.
    with open(dep_path, 'w', encoding='utf-8') as out:
        for sent in sents:
            for cells in sent:
                tag = cells[3]
                tag = tag.split('-')[0]  # NT-SHORT ---> NT
                if tag == 'X':  # 铜_NN 30_CD ｘ_X 25_CD ｘ_X 14_CD cm_NT 1999_NT
                    tag = 'FW'
                cells[3] = cells[4] = tag
                out.write('\t'.join(str(x) for x in cells))
                out.write('\n')
            out.write('\n')


def reverse_splits(splits):
    """Invert a ``{domain: ids}`` mapping into ``{id: domain}``.

    When an id occurs under several domains, the domain iterated last wins.
    """
    return {cid: domain for domain, cids in splits.items() for cid in cids}


def split_chtb(chtbs: List[str], splits=None):
    """Distribute treebank file names over train, dev and test.

    Args:
        chtbs: File names shaped like ``<prefix>_<id>.<domain>.<ext>``.
        splits: Optional mapping with ``'train'``, ``'dev'`` and ``'test'`` containers of ids. Files
            whose id is in none of them are left out. Without ``splits``, names (before the first
            dot) ending with ``8`` go to dev, those ending with ``9`` to test and the rest to train.

    Returns:
        A ``(train, dev, test)`` tuple of lists of file names.
    """
    train, dev, test = [], [], []
    discarded = []
    for filename in chtbs:
        name, _domain, _ext = filename.split('.', 2)
        _, cid = name.split('_')
        if splits:
            for part_name, bucket in (('train', train), ('dev', dev), ('test', test)):
                if cid in splits[part_name]:
                    target = bucket
                    break
            else:
                target = discarded
        elif name.endswith('8'):
            target = dev
        elif name.endswith('9'):
            target = test
        else:
            target = train
        target.append(filename)
    return train, dev, test


def id_of_chtb(each: str):
    """Return the integer after the last underscore of the part of ``each`` preceding its first dot."""
    stem = each.partition('.')[0]
    return int(stem.rpartition('_')[2])


def make_ctb(ctb_home):
    """Prepare a CTB release: clean the bracketed files and generate the task files.

    Both steps are skipped when their output folder (``cleaned_bracket`` and ``tasks``) already
    exists. If generating the task files fails, the ``tasks`` folder is removed and the exception
    is re-raised.

    Args:
        ctb_home: Identifier of the treebank, resolved through ``get_resource``.
    """
    ctb_home = get_resource(ctb_home)
    cleaned_root = join(ctb_home, 'cleaned_bracket')
    if not os.path.isdir(cleaned_root):
        clean_ctb_bracketed(ctb_home, cleaned_root)
    tasks_root = join(ctb_home, 'tasks')
    if os.path.isdir(tasks_root):
        return
    part_names = ['train', 'dev', 'test']
    try:
        chtbs = _list_treebank_root(cleaned_root)
        print(f'For the {len(chtbs)} files in CTB, we apply the following splits:')
        parts = split_chtb(chtbs)
        for name, part in zip(part_names, parts):
            print(f'{name} = {[id_of_chtb(x) for x in part]}')
        cprint('[yellow]Each file id ending with 8/9 is put into '
               'dev/test respectively, the rest are put into train. '
               'Our splits ensure files are evenly split across each genre, which is recommended '
               'for production systems.[/yellow]')
        for name, part in zip(part_names, parts):
            make_ctb_tasks([join(cleaned_root, x) for x in part], tasks_root, name)
        cprint('Done pre-processing CTB. Enjoy your research with [blue]HanLP[/blue]!')
    except Exception:
        # Do not leave a partially built folder behind, it would be taken as complete next time.
        shutil.rmtree(tasks_root, ignore_errors=True)
        raise


def load_domains(ctb_home):
    """
    Group the file ids of a Chinese treebank by domain.

    File names in the ``bracketed`` folder are expected to look like ``<prefix>_<id>.<domain>``.

    Args:
        ctb_home: Root path to CTB.

    Returns:
        A dict mapping each domain to the set of its file ids.
    """
    bracketed_root = join(get_resource(ctb_home), 'bracketed')
    domains = defaultdict(set)
    for filename in _list_treebank_root(bracketed_root):
        name, domain = filename.split('.')
        _, fid = name.split('_')
        domains[domain].add(fid)
    return domains


def ctb_pos_to_text_format(path, delimiter='_'):
    """
    Convert a ctb pos tagging corpus from tsv format to text format, one sentence per line, with
    the cells of each token joined by ``delimiter``. The result is saved next to the input with a
    ``.txt`` extension.

    Args:
        path: File to be converted.
        delimiter: Delimiter between word and tag.
    """
    path = get_resource(path)
    stem = os.path.splitext(path)[0]
    with open(f'{stem}.txt', 'w', encoding='utf-8') as out:
        for sent in read_tsv_as_sents(path):
            tokens = (delimiter.join(pair) for pair in sent)
            out.write(' '.join(tokens) + '\n')


def remove_all_ec(path):
    """
    Remove empty categories for all trees in this file and save them into a "noempty" file.

    The work is delegated to a Java program that is fetched through ``get_resource`` and run from
    within its own folder.

    Args:
        path: File path.
    """
    script = get_resource('https://file.hankcs.com/bin/remove_ec.zip')
    jars = ['elit-ddr-0.0.5-SNAPSHOT.jar', 'elit-sdk-0.0.5-SNAPSHOT.jar', 'hanlp-1.7.8.jar',
            'fastutil-8.1.1.jar', '.']
    # Java expects the platform specific separator (':' on POSIX, ';' on Windows) in a classpath.
    classpath = os.pathsep.join(jars)
    with pushd(script):
        run_cmd(f'java -cp {classpath} demo.RemoveEmptyCategoriesTreebank {path}')


================================================
FILE: hanlp/datasets/parsing/loaders/conll_dataset.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 16:10
from typing import Union, List, Callable, Dict

from hanlp_common.constant import ROOT, EOS, BOS
from hanlp.common.dataset import TransformableDataset
from hanlp.components.parsers.conll import read_conll
from hanlp.utils.io_util import TimingFileIterator


class CoNLLParsingDataset(TransformableDataset):

    def __init__(self,
                 data: Union[str, List],
                 transform: Union[Callable, List] = None,
                 cache=None,
                 generate_idx=None,
                 prune: Callable[[Dict[str, List[str]]], bool] = None) -> None:
        """Dataset of CoNLL style dependency parsing samples.

        Args:
            data: The local or remote path to a dataset, or a list of samples where each sample is a dict.
            transform: Predefined transform(s).
            cache: ``True`` to enable caching, so that transforms won't be called twice.
            generate_idx: Create a :const:`~hanlp_common.constants.IDX` field for each sample to store its order in dataset. Useful for prediction when
                samples are re-ordered by a sampler.
            prune: A predicate over samples; samples for which it returns a true value are dropped.
        """
        # Assigned before the parent constructor runs, as in the original ordering.
        self._prune = prune
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath):
        """Yield one dict per sentence, mapping each column name to the list of its cells.

        Both ``.conllx`` and ``.conllu`` are supported. Their descriptions can be found in
        :class:`hanlp_common.conll.CoNLLWord` and :class:`hanlp_common.conll.CoNLLUWord` respectively.

        Args:
            filepath: ``.conllx`` or ``.conllu`` file path.
        """
        if filepath.endswith('.conllu'):
            # See https://universaldependencies.org/format.html
            columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS',
                       'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
        else:
            columns = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS',
                       'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
        fp = TimingFileIterator(filepath)
        for idx, sent in enumerate(read_conll(fp)):
            sample = {name: [cell[col] for cell in sent] for col, name in enumerate(columns)}
            if not (self._prune and self._prune(sample)):
                yield sample
            fp.log(f'{idx + 1} samples [blink][yellow]...[/yellow][/blink]')

    def __len__(self) -> int:
        """Number of entries in ``self.data``."""
        return len(self.data)


def append_bos(sample: dict, pos_key='CPOS', bos=ROOT) -> dict:
    """Prepend a root placeholder to the token level fields of a CoNLL sample.

    Args:
        sample: A sample holding at least ``FORM``.
        pos_key: Key of the part-of-speech field copied into ``pos``, if present.
        bos: A special token inserted to the head of tokens.

    Returns:
        The same sample with ``token`` set, plus ``pos`` (prefixed with ``ROOT``) when ``pos_key``
        exists, and ``arc``/``rel`` when ``HEAD`` exists. ``arc`` is prefixed with ``0`` and ``rel``
        with the first relation of the sentence.
    """
    forms = sample['FORM']
    sample['token'] = [bos] + forms
    if pos_key in sample:
        tags = sample[pos_key]
        sample['pos'] = [ROOT] + tags
    if 'HEAD' in sample:
        sample['arc'] = [0] + sample['HEAD']
        relations = sample['DEPREL']
        sample['rel'] = relations[:1] + relations
    return sample


def append_bos_eos(sample: dict) -> dict:
    """Wrap the token level fields of a CoNLL sample with begin and end placeholders.

    ``token`` is ``FORM`` surrounded by ``BOS``/``EOS``; ``pos`` is built likewise from ``CPOS`` when
    present. When ``HEAD`` is present, ``arc`` is padded with ``0`` and ``rel`` with the first
    relation of the sentence on both sides.

    Args:
        sample: A sample holding at least ``FORM``.

    Returns:
        The same sample, updated in place.
    """
    forms = sample['FORM']
    sample['token'] = [BOS] + forms + [EOS]
    if 'CPOS' in sample:
        tags = sample['CPOS']
        sample['pos'] = [BOS] + tags + [EOS]
    if 'HEAD' in sample:
        sample['arc'] = [0] + sample['HEAD'] + [0]
        relations = sample['DEPREL']
        sample['rel'] = relations[:1] + relations + relations[:1]
    return sample


def get_sibs(sample: dict) -> dict:
    """Add a ``sib_id`` field linking tokens that share a head on the same side of it.

    For each token ``i`` (position 0 is skipped) the first later token ``j`` with the same
    non-negative head and lying on the same side of that head is looked up. Of the two, the one
    farther from the head gets the other as its sibling; on equal distance ``j`` gets ``i``.
    Tokens without a sibling get ``-1`` and position 0 gets ``0``.

    Args:
        sample: A sample that may hold head indices under ``arc``.

    Returns:
        The same sample; it is left untouched when ``arc`` is missing or empty.
    """
    heads = sample.get('arc', None)
    if not heads:
        return sample
    n = len(heads)
    sibs = [-1] * n
    for i in range(1, n):
        head_i = heads[i]
        for j in range(i + 1, n):
            head_j = heads[j]
            offset_i = head_i - i
            offset_j = head_j - j
            same_side_siblings = head_i >= 0 and head_j >= 0 and head_i == head_j and offset_i * offset_j > 0
            if not same_side_siblings:
                continue
            if abs(offset_i) > abs(offset_j):
                sibs[i] = j
            else:
                sibs[j] = i
            # Only the nearest matching token is considered for i.
            break
    sample['sib_id'] = [0] + sibs[1:]
    return sample


================================================
FILE: hanlp/datasets/parsing/loaders/constituency_dataset.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-28 19:27
from typing import List

from phrasetree.tree import Tree

from hanlp_common.constant import EOS, BOS
from hanlp.common.dataset import TransformableDataset


class ConstituencyDataset(TransformableDataset):
    def load_file(self, filepath: str):
        """Yield one sample per non-empty line, each line holding a bracketed tree.

        Args:
            filepath: Path to a UTF-8 file with one bracketed tree per line.

        Yields:
            Dicts with the parsed tree stored under ``constituency``.
        """
        # Read as UTF-8 explicitly: the treebank files in this project are written as UTF-8,
        # while the locale default differs between platforms.
        with open(filepath, encoding='utf-8') as src:
            for line in src:
                line = line.strip()
                if not line:
                    continue
                yield {'constituency': Tree.fromstring(line)}


def unpack_tree_to_features(sample: dict):
    """Derive ``token`` and ``chart`` from the tree stored under ``constituency``.

    ``chart`` is a square table of size ``len(words) + 1`` in which ``chart[i][j]`` holds the label
    of the span ``(i, j)`` of the binarized tree and ``None`` elsewhere. ``token`` is the word
    sequence wrapped with ``BOS`` and ``EOS``.

    Args:
        sample: A sample that may hold a tree under ``constituency``.

    Returns:
        The same sample; it is left untouched when the tree is missing or empty.
    """
    tree = sample.get('constituency', None)
    if not tree:
        return sample
    words, tags = zip(*tree.pos())
    size = len(words) + 1
    chart = [[None] * size for _ in range(size)]
    # The top node of the binarized tree is skipped; spans come from its first child.
    binarized = binarize(tree)[0]
    for left, right, label in factorize(binarized):
        chart[left][right] = label
    sample['token'] = [BOS] + list(words) + [EOS]
    sample['chart'] = chart
    return sample


def append_bos_eos(sample: dict):
    """Wrap ``token`` with ``BOS``/``EOS`` once, keeping the original tokens under ``_con_token``.

    Samples that already carry ``_con_token`` are returned unchanged, so applying this twice is safe.
    """
    if '_con_token' in sample:
        return sample
    tokens = sample['token']
    sample['_con_token'] = tokens
    sample['token'] = [BOS] + tokens + [EOS]
    return sample


def remove_subcategory(sample: dict):
    """Truncate every label of the tree under ``constituency`` at its first ``-``, in place.

    Args:
        sample: A sample that may hold a tree under ``constituency``.

    Returns:
        The same sample.
    """
    tree = sample.get('constituency', None)
    if not tree:
        return sample
    for node in tree.subtrees():
        coarse_label = node.label().split('-')[0]
        node.set_label(coarse_label)
    return sample


def binarize(tree: Tree):
    r"""
    Binarizes a copy of the tree; the input is left unchanged.

    In every node with more than one child, each child whose own first child is not a tree is first
    wrapped into an auxiliary node labelled ``<parent label>|<>``. The tree is then transformed to
    `Chomsky Normal Form (CNF)`_ with left factoring via :meth:`~tree.Tree.chomsky_normal_form`,
    and finally all unary productions are collapsed.

    Args:
        tree (tree.Tree):
            The tree to be binarized.

    Returns:
        The binarized tree.

    Examples:
        >>> tree = Tree.fromstring('''
                                        (TOP
                                          (S
                                            (NP (_ She))
                                            (VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis)))))
                                            (_ .)))
                                        ''')
        >>> print(binarize(tree))
        (TOP
          (S
            (S|<>
              (NP (_ She))
              (VP
                (VP|<> (_ enjoys))
                (S+VP (VP|<> (_ playing)) (NP (_ tennis)))))
            (S|<> (_ .))))

    .. _Chomsky Normal Form (CNF):
        https://en.wikipedia.org/wiki/Chomsky_normal_form
    """

    binarized = tree.copy(True)
    stack = [binarized]
    while stack:
        node = stack.pop()
        if not isinstance(node, Tree):
            continue
        # Queue the original children before any of them is replaced by a wrapper below.
        stack.extend(node)
        if len(node) <= 1:
            continue
        for idx, child in enumerate(node):
            if not isinstance(child[0], Tree):
                node[idx] = Tree(f"{node.label()}|<>", [child])
    binarized.chomsky_normal_form('left', 0, 0)
    binarized.collapse_unary()

    return binarized


def factorize(tree, delete_labels=None, equal_labels=None):
    r"""
    Factorizes the tree into a sequence of labelled spans.
    The tree is traversed in pre-order.

    Args:
        tree (tree.Tree):
            The tree to be factorized.
        delete_labels (set[str]):
            A set of labels to be ignored. This is used for evaluation.
            If it is a pre-terminal label, delete the word along with the brackets.
            If it is a non-terminal label, just delete the brackets (don't delete children).
            In `EVALB`_, the default set is:
            {'TOP', 'S1', '-NONE-', ',', ':', '``', "''", '.', '?', '!', ''}
            Default: ``None``.
        equal_labels (dict[str, str]):
            Labels found as keys are replaced by their values. This is used for evaluation.
            The default dict defined in `EVALB`_ is: {'ADVP': 'PRT'}
            Default: ``None``.

    Returns:
        The sequence of ``(left, right, label)`` tuples of the factorized tree.

    Examples:
        >>> tree = Tree.fromstring('''
                                        (TOP
                                          (S
                                            (NP (_ She))
                                            (VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis)))))
                                            (_ .)))
                                        ''')
        >>> factorize(tree)
        [(0, 5, 'TOP'), (0, 5, 'S'), (0, 1, 'NP'), (1, 4, 'VP'), (2, 4, 'S'), (2, 4, 'VP'), (3, 4, 'NP')]
        >>> factorize(tree, delete_labels={'TOP', 'S1', '-NONE-', ',', ':', '``', "''", '.', '?', '!', ''})
        [(0, 5, 'S'), (0, 1, 'NP'), (1, 4, 'VP'), (2, 4, 'S'), (2, 4, 'VP'), (3, 4, 'NP')]

    .. _EVALB:
        https://nlp.cs.nyu.edu/evalb/
    """

    def visit(node, start):
        # Returns the position after ``node`` and the spans found inside it.
        label = node.label()
        if delete_labels is not None and label in delete_labels:
            label = None
        if equal_labels is not None:
            label = equal_labels.get(label, label)
        is_preterminal = len(node) == 1 and not isinstance(node[0], Tree)
        if is_preterminal:
            # A deleted pre-terminal does not consume a position.
            end = start if label is None else start + 1
            return end, []
        end = start
        collected = []
        for child in node:
            if not isinstance(child, Tree):
                continue
            end, child_spans = visit(child, end)
            collected += child_spans
        if label is not None and end > start:
            collected = [(start, end, label)] + collected
        return end, collected

    _, spans = visit(tree, 0)
    return spans


def build_tree(tokens: List[str], sequence):
    r"""
    Builds a constituency tree rooted at ``TOP`` from a pre-order sequence of spans.
    While building, the sequence is de-binarized to the original format: spans whose label ends
    with ``|<>`` contribute their children only, and labels joined by ``+`` are expanded back into
    a chain of unary nodes.

    Args:
        tokens :
            All tokens in a sentence.
        sequence (list[tuple]):
            A list of tuples used for generating a tree.
            Each tuple consists of the indices of left/right span boundaries and label of the span.

    Returns:
        A result constituency tree. For an empty ``tokens`` an empty ``TOP`` tree is returned.

    Examples:
        >>> tokens = ['She', 'enjoys', 'playing', 'tennis', '.']
        >>> sequence = [(0, 5, 'S'), (0, 4, 'S|<>'), (0, 1, 'NP'), (1, 4, 'VP'), (1, 2, 'VP|<>'),
                        (2, 4, 'S+VP'), (2, 3, 'VP|<>'), (3, 4, 'NP'), (4, 5, 'S|<>')]
        >>> print(build_tree(tokens, sequence))
        (TOP
          (S
            (NP (_ She))
            (VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis)))))
            (_ .)))
    """
    if not tokens:  # User passed in [], which is the tokenized result of ''
        return Tree('TOP', [])
    # One ``(_ token)`` pre-terminal per token, in sentence order.
    leaves = [Tree('_', [t]) for t in tokens]
    spans = iter(sequence)

    def expand():
        left, right, label = next(spans)
        if right == left + 1:
            children = [leaves[left]]
        else:
            # A wider span is followed in the sequence by its two halves.
            first = expand()
            children = first + expand()
        if label.endswith('|<>'):
            return children
        *outer, innermost = label.split('+')
        subtree = Tree(innermost, children)
        for unary in reversed(outer):
            subtree = Tree(unary, [subtree])
        return [subtree]

    return Tree('TOP', expand())


================================================
FILE: hanlp/datasets/parsing/pmt1.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-02-15 04:14
import os.path

from hanlp.utils.io_util import get_resource
from hanlp.utils.log_util import cprint
from hanlp_common.conll import CoNLLSentence, CoNLLWord

_HOME = 'https://github.com/qiulikun/PKUMultiviewTreebank/archive/refs/heads/master.zip'
# Raw treebank file from which the three splits below are generated.
PTM_V1_RAW = f'{_HOME}#199801_dependency_treebank_2014pos.txt'
PTM_V1_TRAIN = f'{_HOME}#train.conllx'
'The training set of PKU Multi-view Chinese Treebank (PMT) 1.0 (:cite:`qiu-etal-2014-multi`).'
PTM_V1_DEV = f'{_HOME}#dev.conllx'
'The dev set of PKU Multi-view Chinese Treebank (PMT) 1.0 (:cite:`qiu-etal-2014-multi`).'
PTM_V1_TEST = f'{_HOME}#test.conllx'
'The test set of PKU Multi-view Chinese Treebank (PMT) 1.0 (:cite:`qiu-etal-2014-multi`).'


def _make_ptm():
    """Split the raw PMT 1.0 treebank into ``train``/``dev``/``test`` CoNLL-X files.

    The raw file is located through ``get_resource(PTM_V1_RAW)`` and the three ``{part}.conllx`` files are
    written into the same directory. Nothing is done when all three files already exist.

    The raw file is parsed as blank-line separated records of four whitespace-tokenized lines each: forms,
    part-of-speech tags, dependency relations and heads. Files are read and written as UTF-8 regardless of the
    locale, and a trailing record that is not followed by a blank line is kept.
    """
    raw = get_resource(PTM_V1_RAW)
    home = os.path.dirname(raw)
    if all(os.path.isfile(os.path.join(home, f'{part}.conllx')) for part in ['train', 'dev', 'test']):
        return

    sents = []

    def _flush(buffer):
        # One record = four parallel lines; word IDs are 1-based.
        tok, pos, rel, arc = [x.split() for x in buffer]
        sent = CoNLLSentence()
        for i, (t, p, r, a) in enumerate(zip(tok, pos, rel, arc)):
            sent.append(CoNLLWord(i + 1, form=t, cpos=p, head=a, deprel=r))
        sents.append(sent)
        buffer.clear()

    # The treebank is Chinese text, so never rely on the platform's default encoding.
    with open(raw, encoding='utf-8') as src:
        buffer = []
        for line in src:
            line = line.strip()
            if line:
                buffer.append(line)
            elif buffer:
                _flush(buffer)
        # The last record is not necessarily followed by a blank line.
        if buffer:
            _flush(buffer)

    prev_offset = 0
    # Sentences 12001-13000 and 13001-14463 are used as the development and test set, respectively. The remaining
    # sentences are used as training data.
    for part, offset in zip(['train', 'dev', 'test'], [12000, 13000, 14463]):
        with open(os.path.join(home, f'{part}.conllx'), 'w', encoding='utf-8') as out:
            portion = sents[prev_offset:offset]
            cprint(f'[yellow]{len(portion)}[/yellow] sentences [cyan][{prev_offset + 1}:{offset})[/cyan] in {part}')
            for sent in portion:
                out.write(str(sent) + '\n\n')
        prev_offset = offset


# Runs at import time so that the split files exist once this module has been imported.
_make_ptm()


================================================
FILE: hanlp/datasets/parsing/ptb.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-02-17 15:46

# `data/` folder inside the master archive of the LAL-Parser repository; every PTB path below is relative to it.
_PTB_HOME = 'https://github.com/KhalilMrini/LAL-Parser/archive/master.zip#data/'

PTB_TRAIN = _PTB_HOME + '02-21.10way.clean'
'''Training set of PTB without empty categories. PoS tags are automatically predicted using 10-fold 
jackknifing (:cite:`collins-koo-2005-discriminative`).'''
PTB_DEV = _PTB_HOME + '22.auto.clean'
'''Dev set of PTB without empty categories. PoS tags are automatically predicted using 10-fold 
jackknifing (:cite:`collins-koo-2005-discriminative`).'''
PTB_TEST = _PTB_HOME + '23.auto.clean'
'''Test set of PTB without empty categories. PoS tags are automatically predicted using 10-fold 
jackknifing (:cite:`collins-koo-2005-discriminative`).'''

PTB_SD330_TRAIN = _PTB_HOME + 'ptb_train_3.3.0.sd.clean'
'''Training set of PTB in Stanford Dependencies 3.3.0 format. PoS tags are automatically predicted using 10-fold 
jackknifing (:cite:`collins-koo-2005-discriminative`).'''
PTB_SD330_DEV = _PTB_HOME + 'ptb_dev_3.3.0.sd.clean'
'''Dev set of PTB in Stanford Dependencies 3.3.0 format. PoS tags are automatically predicted using 10-fold 
jackknifing (:cite:`collins-koo-2005-discriminative`).'''
PTB_SD330_TEST = _PTB_HOME + 'ptb_test_3.3.0.sd.clean'
'''Test set of PTB in Stanford Dependencies 3.3.0 format. PoS tags are automatically predicted using 10-fold 
jackknifing (:cite:`collins-koo-2005-discriminative`).'''

# Maps PTB bracket escapes (``-LRB-`` etc.), PTB-style quotes, typographic quotes and dashes to plain ASCII
# replacements. All double-quote variants collapse to `"`, all single-quote variants to `'`, both dashes to `--`.
PTB_TOKEN_MAPPING = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
    "``": '"',
    "''": '"',
    "`": "'",
    '«': '"',
    '»': '"',
    '‘': "'",
    '’': "'",
    '“': '"',
    '”': '"',
    '„': '"',
    '‹': "'",
    '›': "'",
    "\u2013": "--",  # en dash
    "\u2014": "--",  # em dash
}


================================================
FILE: hanlp/datasets/parsing/semeval15.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-28 14:40
# from hanlp.datasets.parsing.conll_dataset import CoNLLParsingDataset
#
#
# class SemEval15Dataset(CoNLLParsingDataset):
#     def load_file(self, filepath: str):
#         pass
import warnings

from hanlp_common.constant import ROOT, PAD
from hanlp_common.conll import CoNLLSentence


def unpack_deps_to_head_deprel(sample: dict, pad_rel=None, arc_key='arc', rel_key='rel'):
    """Expand the ``DEPS`` field of a sample into dense arc and relation matrices.

    A placeholder row for the root is prepended, so for ``n`` tokens both matrices are ``(n + 1) x (n + 1)``.
    Row ``i`` describes token ``i``: ``arc[i][h]`` is ``True`` when ``h`` is one of its heads and ``rel[i][h]``
    holds the relation of that arc.

    Args:
        sample: A sample dict. It is modified in place and left untouched when it has no ``DEPS`` key. Each
            ``DEPS`` entry is either ``'_'`` or ``|``-separated ``head:relation`` pairs.
        pad_rel: Relation used for cells without an arc. When falsy, the first relation met in ``DEPS`` is
            used, falling back to ``PAD`` if there is none.
        arc_key: Key under which the boolean arc matrix is stored.
        rel_key: Key under which the relation matrix is stored.

    Returns:
        The same ``sample`` object.
    """
    if 'DEPS' in sample:
        deps = ['_'] + sample['DEPS']
        size = len(deps)
        sample[arc_key] = arc = []
        sample[rel_key] = rel = []
        for each in deps:
            arc_per_token = [False] * size
            rel_per_token = [None] * size
            if each != '_':
                for ar in each.split('|'):
                    # Split on the first colon only: a relation may itself contain colons (e.g. `nsubj:pass`).
                    a, r = ar.split(':', 1)
                    a = int(a)
                    arc_per_token[a] = True
                    rel_per_token[a] = r
                    if not pad_rel:
                        pad_rel = r
            arc.append(arc_per_token)
            rel.append(rel_per_token)
        if not pad_rel:
            pad_rel = PAD
        for i in range(len(rel)):
            rel[i] = [r if r else pad_rel for r in rel[i]]
    return sample


def append_bos_to_form_pos(sample, pos_key='CPOS'):
    """Store ``ROOT``-prefixed copies of the forms (and tags, if present) in the sample.

    Args:
        sample: A sample dict with a ``FORM`` list. It is modified in place.
        pos_key: Key of the part-of-speech list to copy into ``pos``; ignored when absent from ``sample``.

    Returns:
        The same ``sample`` object with ``token`` set, and ``pos`` set when ``pos_key`` is present.
    """
    bos = [ROOT]
    sample['token'] = bos + sample['FORM']
    if pos_key not in sample:
        return sample
    sample['pos'] = bos + sample[pos_key]
    return sample


def merge_head_deprel_with_2nd(sample: dict):
    """Write the primary head and relation of every token into the secondary dependency matrices.

    Does nothing when the sample has no ``arc`` key. Otherwise, for every position except the first one, the
    cell ``[i][head]`` of ``arc_2nd`` is set to ``True`` and the same cell of ``rel_2nd`` to the primary
    relation. A warning is issued when that cell already held an arc with a different relation.

    Args:
        sample: A sample dict with ``arc``, ``rel``, ``arc_2nd`` and ``rel_2nd``. The two matrices are modified
            in place.

    Returns:
        The same ``sample`` object.
    """
    if 'arc' not in sample:
        return sample
    arc_2nd = sample['arc_2nd']
    rel_2nd = sample['rel_2nd']
    for idx, (head, label) in enumerate(zip(sample['arc'], sample['rel'])):
        if idx == 0:
            # The first position is skipped.
            continue
        if arc_2nd[idx][head] and rel_2nd[idx][head] != label:
            sample_str = CoNLLSentence.from_dict(sample, conllu=True).to_markdown()
            message = (f'The main dependency conflicts with 2nd dependency at ID={idx}, '
                       'which means joint mode might not be suitable. '
                       f'The sample is\n{sample_str}')
            warnings.warn(message)
        arc_2nd[idx][head] = True
        rel_2nd[idx][head] = label
    return sample


================================================
FILE: hanlp/datasets/parsing/semeval16.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 00:51
from hanlp_common.conll import CoNLLSentence
import os

from hanlp.utils.io_util import get_resource, merge_files
from hanlp_common.io import eprint

# Master archive of the HIT-SCIR SemEval-2016 repository; every path below is relative to it.
_SEMEVAL2016_HOME = 'https://github.com/HIT-SCIR/SemEval-2016/archive/master.zip'

# News domain, CoNLL files.
SEMEVAL2016_NEWS_TRAIN = _SEMEVAL2016_HOME + '#train/news.train.conll'
SEMEVAL2016_NEWS_DEV = _SEMEVAL2016_HOME + '#validation/news.valid.conll'
SEMEVAL2016_NEWS_TEST = _SEMEVAL2016_HOME + '#test/news.test.conll'

# News domain, `.conllu` counterparts (same path with the extension replaced).
SEMEVAL2016_NEWS_TRAIN_CONLLU = _SEMEVAL2016_HOME + '#train/news.train.conllu'
SEMEVAL2016_NEWS_DEV_CONLLU = _SEMEVAL2016_HOME + '#validation/news.valid.conllu'
SEMEVAL2016_NEWS_TEST_CONLLU = _SEMEVAL2016_HOME + '#test/news.test.conllu'

# Text domain, CoNLL files.
SEMEVAL2016_TEXT_TRAIN = _SEMEVAL2016_HOME + '#train/text.train.conll'
SEMEVAL2016_TEXT_DEV = _SEMEVAL2016_HOME + '#validation/text.valid.conll'
SEMEVAL2016_TEXT_TEST = _SEMEVAL2016_HOME + '#test/text.test.conll'

# Text domain, `.conllu` counterparts (same path with the extension replaced).
SEMEVAL2016_TEXT_TRAIN_CONLLU = _SEMEVAL2016_HOME + '#train/text.train.conllu'
SEMEVAL2016_TEXT_DEV_CONLLU = _SEMEVAL2016_HOME + '#validation/text.valid.conllu'
SEMEVAL2016_TEXT_TEST_CONLLU = _SEMEVAL2016_HOME + '#test/text.test.conllu'

# News and text domains combined.
SEMEVAL2016_FULL_TRAIN_CONLLU = _SEMEVAL2016_HOME + '#train/full.train.conllu'
SEMEVAL2016_FULL_DEV_CONLLU = _SEMEVAL2016_HOME + '#validation/full.valid.conllu'
SEMEVAL2016_FULL_TEST_CONLLU = _SEMEVAL2016_HOME + '#test/full.test.conllu'


def convert_conll_to_conllu(path):
    """Convert a CoNLL file into a ``.conllu`` file written next to it.

    For every word without ``deps``, its ``(head, deprel)`` pair becomes its only ``deps`` entry and ``head``
    and ``deprel`` are cleared. Sentences are written separated by a blank line.

    Args:
        path: Path to the source file. The output path is ``path`` with its extension replaced by ``.conllu``.
    """
    sents = CoNLLSentence.from_file(path, conllu=True)
    # The corpus is Chinese text: write UTF-8 explicitly rather than relying on the platform default.
    with open(os.path.splitext(path)[0] + '.conllu', 'w', encoding='utf-8') as out:
        for sent in sents:
            for word in sent:
                if not word.deps:
                    word.deps = [(word.head, word.deprel)]
                    word.head = None
                    word.deprel = None
            out.write(str(sent))
            out.write('\n\n')


# Convert every CoNLL split to `.conllu` once; an existing `.conllu` file next to the source is reused.
for file in [SEMEVAL2016_NEWS_TRAIN, SEMEVAL2016_NEWS_DEV, SEMEVAL2016_NEWS_TEST,
             SEMEVAL2016_TEXT_TRAIN, SEMEVAL2016_TEXT_DEV, SEMEVAL2016_TEXT_TEST]:
    file = get_resource(file)
    conllu = os.path.splitext(file)[0] + '.conllu'
    if not os.path.isfile(conllu):
        eprint(f'Converting {os.path.basename(file)} to {os.path.basename(conllu)} ...')
        convert_conll_to_conllu(file)

# Merge the news and text domains of each split into a `full.*.conllu` file. Each merged file must land in the
# folder its SEMEVAL2016_FULL_*_CONLLU constant points to (train/, validation/, test/), not always in train/.
for group, part, folder in zip([[SEMEVAL2016_NEWS_TRAIN_CONLLU, SEMEVAL2016_TEXT_TRAIN_CONLLU],
                                [SEMEVAL2016_NEWS_DEV_CONLLU, SEMEVAL2016_TEXT_DEV_CONLLU],
                                [SEMEVAL2016_NEWS_TEST_CONLLU, SEMEVAL2016_TEXT_TEST_CONLLU]],
                               ['train', 'valid', 'test'],
                               ['train', 'validation', 'test']):
    root = get_resource(_SEMEVAL2016_HOME)
    dst = f'{root}/{folder}/full.{part}.conllu'
    if not os.path.isfile(dst):
        group = [get_resource(x) for x in group]
        eprint(f'Concatenating {os.path.basename(group[0])} and {os.path.basename(group[1])} '
               f'into full dataset {os.path.basename(dst)} ...')
        merge_files(group, dst)


================================================
FILE: hanlp/datasets/parsing/ud/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-07 21:45
import os
import shutil

from hanlp.components.parsers.ud.udify_util import get_ud_treebank_files
from hanlp.utils.io_util import get_resource
from hanlp.utils.log_util import flash


def concat_treebanks(home, version):
    """Concatenate all treebanks of a UD release into one ``train``/``dev``/``test`` file each.

    The merged files are written to a ``ud-multilingual-v{version}`` directory located two levels above the
    resolved ``home``. If that directory already exists it is returned as is, without rebuilding.

    Args:
        home: Resource identifier of the UD treebanks folder, resolved with ``get_resource``.
        version: Version string used in the name of the output directory.

    Returns:
        Absolute path of the output directory.
    """
    ud_home = get_resource(home)
    treebanks = get_ud_treebank_files(ud_home)
    output_dir = os.path.abspath(os.path.join(ud_home, os.path.pardir, os.path.pardir, f'ud-multilingual-v{version}'))
    if os.path.isdir(output_dir):
        return output_dir
    os.makedirs(output_dir)
    # Transpose the per-treebank (train, dev, test) entries into one tuple of paths per split.
    train, dev, test = list(zip(*[treebanks[k] for k in treebanks]))

    for treebank, name in zip([train, dev, test], ["train.conllu", "dev.conllu", "test.conllu"]):
        flash(f'Concatenating {len(train)} treebanks into {name} [blink][yellow]...[/yellow][/blink]')
        # CoNLL-U files are UTF-8; the platform default encoding would fail or corrupt text on other locales.
        with open(os.path.join(output_dir, name), 'w', encoding='utf-8') as write:
            for t in treebank:
                if not t:
                    # This treebank does not provide the current split.
                    continue
                with open(t, 'r', encoding='utf-8') as read:
                    shutil.copyfileobj(read, write)
        flash('')
    return output_dir


================================================
FILE: hanlp/datasets/parsing/ud/ud210.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-07 21:03
import glob
import os

from hanlp.utils.io_util import uncompress, get_resource

# Download URL of the whole UD 2.10 release; `_UD_210_HOME` addresses the treebanks folder inside it.
_UD_210_URL = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-4758/allzip"
_UD_210_HOME = _UD_210_URL + '#ud-treebanks-v2.10/'
_path = get_resource(_UD_210_URL)
# Runs at import time. When the resource resolves to a plain file (presumably the downloaded archive, which has
# no extension because of the URL -- TODO confirm), give it a `.zip` suffix and unpack it, then unpack the
# treebank tarball expected under `_path` afterwards.
if os.path.isfile(_path):
    os.rename(_path, _path + '.zip')
    uncompress(_path + '.zip')
    uncompress(os.path.join(_path, 'ud-treebanks-v2.10.tgz'))


# noinspection PyShadowingNames
def _list_dir(path, home):
    """Append one dataset constant per available split to ``ud210.py`` in the current working directory.

    Every ``UD_*`` folder under ``ud-treebanks-v2.10`` is visited in sorted order. For each of the ``train``,
    ``dev`` and ``test`` splits that has a matching ``*.conllu`` file, an assignment line and a one-line
    description string are written.

    Args:
        path: Resource identifier of the UD release, resolved with ``get_resource``.
        home: Name of the module-level variable the generated paths are relative to, e.g. ``'_UD_210_HOME'``.
            Stripped of leading underscores and of ``_HOME``, it also serves as the prefix of generated names.
    """
    prefix = home.lstrip('_').replace('_HOME', '')
    path = get_resource(path)
    splits = ('train', 'dev', 'test')
    with open('ud210.py', 'a') as out:
        for treebank_dir in sorted(glob.glob(path + '/ud-treebanks-v2.10/UD_*')):
            basename = os.path.basename(treebank_dir)
            # `UD_English-EWT` -> `ENGLISH_EWT`
            name = basename[len('UD_'):].upper().replace('-', '_')
            for split in splits:
                matches = glob.glob(treebank_dir + f'/*{split}.conllu')
                if matches:
                    sp = os.path.basename(matches[0])
                    out.write(f'{prefix}_{name}_{split.upper()} = {home} + "{basename}/{sp}"\n')
                    out.write(f'"{prefix} {split} set of {name}."\n')


def main():
    """Generate the UD 2.10 dataset constants by appending them to ``ud210.py``."""
    _list_dir(_UD_210_URL, '_UD_210_HOME')


if __name__ == '__main__':
    main()
UD_210_AFRIKAANS_AFRIBOOMS_TRAIN = _UD_210_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu"
"UD_210 train set of AFRIKAANS_AFRIBOOMS."
UD_210_AFRIKAANS_AFRIBOOMS_DEV = _UD_210_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu"
"UD_210 dev set of AFRIKAANS_AFRIBOOMS."
UD_210_AFRIKAANS_AFRIBOOMS_TEST = _UD_210_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu"
"UD_210 test set of AFRIKAANS_AFRIBOOMS."
UD_210_AKKADIAN_PISANDUB_TEST = _UD_210_HOME + "UD_Akkadian-PISANDUB/akk_pisandub-ud-test.conllu"
"UD_210 test set of AKKADIAN_PISANDUB."
UD_210_AKKADIAN_RIAO_TEST = _UD_210_HOME + "UD_Akkadian-RIAO/akk_riao-ud-test.conllu"
"UD_210 test set of AKKADIAN_RIAO."
UD_210_AKUNTSU_TUDET_TEST = _UD_210_HOME + "UD_Akuntsu-TuDeT/aqz_tudet-ud-test.conllu"
"UD_210 test set of AKUNTSU_TUDET."
UD_210_ALBANIAN_TSA_TEST = _UD_210_HOME + "UD_Albanian-TSA/sq_tsa-ud-test.conllu"
"UD_210 test set of ALBANIAN_TSA."
UD_210_AMHARIC_ATT_TEST = _UD_210_HOME + "UD_Amharic-ATT/am_att-ud-test.conllu"
"UD_210 test set of AMHARIC_ATT."
UD_210_ANCIENT_GREEK_PROIEL_TRAIN = _UD_210_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu"
"UD_210 train set of ANCIENT_GREEK_PROIEL."
UD_210_ANCIENT_GREEK_PROIEL_DEV = _UD_210_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu"
"UD_210 dev set of ANCIENT_GREEK_PROIEL."
UD_210_ANCIENT_GREEK_PROIEL_TEST = _UD_210_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu"
"UD_210 test set of ANCIENT_GREEK_PROIEL."
UD_210_ANCIENT_GREEK_PERSEUS_TRAIN = _UD_210_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu"
"UD_210 train set of ANCIENT_GREEK_PERSEUS."
UD_210_ANCIENT_GREEK_PERSEUS_DEV = _UD_210_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu"
"UD_210 dev set of ANCIENT_GREEK_PERSEUS."
UD_210_ANCIENT_GREEK_PERSEUS_TEST = _UD_210_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu"
"UD_210 test set of ANCIENT_GREEK_PERSEUS."
UD_210_ANCIENT_HEBREW_PTNK_TRAIN = _UD_210_HOME + "UD_Ancient_Hebrew-PTNK/hbo_ptnk-ud-train.conllu"
"UD_210 train set of ANCIENT_HEBREW_PTNK."
UD_210_ANCIENT_HEBREW_PTNK_DEV = _UD_210_HOME + "UD_Ancient_Hebrew-PTNK/hbo_ptnk-ud-dev.conllu"
"UD_210 dev set of ANCIENT_HEBREW_PTNK."
UD_210_ANCIENT_HEBREW_PTNK_TEST = _UD_210_HOME + "UD_Ancient_Hebrew-PTNK/hbo_ptnk-ud-test.conllu"
"UD_210 test set of ANCIENT_HEBREW_PTNK."
UD_210_APURINA_UFPA_TEST = _UD_210_HOME + "UD_Apurina-UFPA/apu_ufpa-ud-test.conllu"
"UD_210 test set of APURINA_UFPA."
UD_210_ARABIC_NYUAD_TRAIN = _UD_210_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu"
"UD_210 train set of ARABIC_NYUAD."
UD_210_ARABIC_NYUAD_DEV = _UD_210_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu"
"UD_210 dev set of ARABIC_NYUAD."
UD_210_ARABIC_NYUAD_TEST = _UD_210_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu"
"UD_210 test set of ARABIC_NYUAD."
UD_210_ARABIC_PADT_TRAIN = _UD_210_HOME + "UD_Arabic-PADT/ar_padt-ud-train.conllu"
"UD_210 train set of ARABIC_PADT."
UD_210_ARABIC_PADT_DEV = _UD_210_HOME + "UD_Arabic-PADT/ar_padt-ud-dev.conllu"
"UD_210 dev set of ARABIC_PADT."
UD_210_ARABIC_PADT_TEST = _UD_210_HOME + "UD_Arabic-PADT/ar_padt-ud-test.conllu"
"UD_210 test set of ARABIC_PADT."
UD_210_ARABIC_PUD_TEST = _UD_210_HOME + "UD_Arabic-PUD/ar_pud-ud-test.conllu"
"UD_210 test set of ARABIC_PUD."
UD_210_ARMENIAN_ARMTDP_TRAIN = _UD_210_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu"
"UD_210 train set of ARMENIAN_ARMTDP."
UD_210_ARMENIAN_ARMTDP_DEV = _UD_210_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu"
"UD_210 dev set of ARMENIAN_ARMTDP."
UD_210_ARMENIAN_ARMTDP_TEST = _UD_210_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu"
"UD_210 test set of ARMENIAN_ARMTDP."
UD_210_ARMENIAN_BSUT_TRAIN = _UD_210_HOME + "UD_Armenian-BSUT/hy_bsut-ud-train.conllu"
"UD_210 train set of ARMENIAN_BSUT."
UD_210_ARMENIAN_BSUT_DEV = _UD_210_HOME + "UD_Armenian-BSUT/hy_bsut-ud-dev.conllu"
"UD_210 dev set of ARMENIAN_BSUT."
UD_210_ARMENIAN_BSUT_TEST = _UD_210_HOME + "UD_Armenian-BSUT/hy_bsut-ud-test.conllu"
"UD_210 test set of ARMENIAN_BSUT."
UD_210_ASSYRIAN_AS_TEST = _UD_210_HOME + "UD_Assyrian-AS/aii_as-ud-test.conllu"
"UD_210 test set of ASSYRIAN_AS."
UD_210_BAMBARA_CRB_TEST = _UD_210_HOME + "UD_Bambara-CRB/bm_crb-ud-test.conllu"
"UD_210 test set of BAMBARA_CRB."
UD_210_BASQUE_BDT_TRAIN = _UD_210_HOME + "UD_Basque-BDT/eu_bdt-ud-train.conllu"
"UD_210 train set of BASQUE_BDT."
UD_210_BASQUE_BDT_DEV = _UD_210_HOME + "UD_Basque-BDT/eu_bdt-ud-dev.conllu"
"UD_210 dev set of BASQUE_BDT."
UD_210_BASQUE_BDT_TEST = _UD_210_HOME + "UD_Basque-BDT/eu_bdt-ud-test.conllu"
"UD_210 test set of BASQUE_BDT."
UD_210_BEJA_NSC_TEST = _UD_210_HOME + "UD_Beja-NSC/bej_nsc-ud-test.conllu"
"UD_210 test set of BEJA_NSC."
UD_210_BELARUSIAN_HSE_TRAIN = _UD_210_HOME + "UD_Belarusian-HSE/be_hse-ud-train.conllu"
"UD_210 train set of BELARUSIAN_HSE."
UD_210_BELARUSIAN_HSE_DEV = _UD_210_HOME + "UD_Belarusian-HSE/be_hse-ud-dev.conllu"
"UD_210 dev set of BELARUSIAN_HSE."
UD_210_BELARUSIAN_HSE_TEST = _UD_210_HOME + "UD_Belarusian-HSE/be_hse-ud-test.conllu"
"UD_210 test set of BELARUSIAN_HSE."
UD_210_BENGALI_BRU_TEST = _UD_210_HOME + "UD_Bengali-BRU/bn_bru-ud-test.conllu"
"UD_210 test set of BENGALI_BRU."
UD_210_BHOJPURI_BHTB_TEST = _UD_210_HOME + "UD_Bhojpuri-BHTB/bho_bhtb-ud-test.conllu"
"UD_210 test set of BHOJPURI_BHTB."
UD_210_BRETON_KEB_TEST = _UD_210_HOME + "UD_Breton-KEB/br_keb-ud-test.conllu"
"UD_210 test set of BRETON_KEB."
UD_210_BULGARIAN_BTB_TRAIN = _UD_210_HOME + "UD_Bulgarian-BTB/bg_btb-ud-train.conllu"
"UD_210 train set of BULGARIAN_BTB."
UD_210_BULGARIAN_BTB_DEV = _UD_210_HOME + "UD_Bulgarian-BTB/bg_btb-ud-dev.conllu"
"UD_210 dev set of BULGARIAN_BTB."
UD_210_BULGARIAN_BTB_TEST = _UD_210_HOME + "UD_Bulgarian-BTB/bg_btb-ud-test.conllu"
"UD_210 test set of BULGARIAN_BTB."
UD_210_BURYAT_BDT_TRAIN = _UD_210_HOME + "UD_Buryat-BDT/bxr_bdt-ud-train.conllu"
"UD_210 train set of BURYAT_BDT."
UD_210_BURYAT_BDT_TEST = _UD_210_HOME + "UD_Buryat-BDT/bxr_bdt-ud-test.conllu"
"UD_210 test set of BURYAT_BDT."
UD_210_CANTONESE_HK_TEST = _UD_210_HOME + "UD_Cantonese-HK/yue_hk-ud-test.conllu"
"UD_210 test set of CANTONESE_HK."
UD_210_CATALAN_ANCORA_TRAIN = _UD_210_HOME + "UD_Catalan-AnCora/ca_ancora-ud-train.conllu"
"UD_210 train set of CATALAN_ANCORA."
UD_210_CATALAN_ANCORA_DEV = _UD_210_HOME + "UD_Catalan-AnCora/ca_ancora-ud-dev.conllu"
"UD_210 dev set of CATALAN_ANCORA."
UD_210_CATALAN_ANCORA_TEST = _UD_210_HOME + "UD_Catalan-AnCora/ca_ancora-ud-test.conllu"
"UD_210 test set of CATALAN_ANCORA."
UD_210_CEBUANO_GJA_TEST = _UD_210_HOME + "UD_Cebuano-GJA/ceb_gja-ud-test.conllu"
"UD_210 test set of CEBUANO_GJA."
UD_210_CHINESE_CFL_TEST = _UD_210_HOME + "UD_Chinese-CFL/zh_cfl-ud-test.conllu"
"UD_210 test set of CHINESE_CFL."
UD_210_CHINESE_GSD_TRAIN = _UD_210_HOME + "UD_Chinese-GSD/zh_gsd-ud-train.conllu"
"UD_210 train set of CHINESE_GSD."
UD_210_CHINESE_GSD_DEV = _UD_210_HOME + "UD_Chinese-GSD/zh_gsd-ud-dev.conllu"
"UD_210 dev set of CHINESE_GSD."
UD_210_CHINESE_GSD_TEST = _UD_210_HOME + "UD_Chinese-GSD/zh_gsd-ud-test.conllu"
"UD_210 test set of CHINESE_GSD."
UD_210_CHINESE_GSDSIMP_TRAIN = _UD_210_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu"
"UD_210 train set of CHINESE_GSDSIMP."
UD_210_CHINESE_GSDSIMP_DEV = _UD_210_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu"
"UD_210 dev set of CHINESE_GSDSIMP."
UD_210_CHINESE_GSDSIMP_TEST = _UD_210_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-test.conllu"
"UD_210 test set of CHINESE_GSDSIMP."
UD_210_CHINESE_HK_TEST = _UD_210_HOME + "UD_Chinese-HK/zh_hk-ud-test.conllu"
"UD_210 test set of CHINESE_HK."
UD_210_CHINESE_PUD_TEST = _UD_210_HOME + "UD_Chinese-PUD/zh_pud-ud-test.conllu"
"UD_210 test set of CHINESE_PUD."
UD_210_CHUKCHI_HSE_TEST = _UD_210_HOME + "UD_Chukchi-HSE/ckt_hse-ud-test.conllu"
"UD_210 test set of CHUKCHI_HSE."
UD_210_CLASSICAL_CHINESE_KYOTO_TRAIN = _UD_210_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu"
"UD_210 train set of CLASSICAL_CHINESE_KYOTO."
UD_210_CLASSICAL_CHINESE_KYOTO_DEV = _UD_210_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu"
"UD_210 dev set of CLASSICAL_CHINESE_KYOTO."
UD_210_CLASSICAL_CHINESE_KYOTO_TEST = _UD_210_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-test.conllu"
"UD_210 test set of CLASSICAL_CHINESE_KYOTO."
UD_210_COPTIC_SCRIPTORIUM_TRAIN = _UD_210_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu"
"UD_210 train set of COPTIC_SCRIPTORIUM."
UD_210_COPTIC_SCRIPTORIUM_DEV = _UD_210_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu"
"UD_210 dev set of COPTIC_SCRIPTORIUM."
UD_210_COPTIC_SCRIPTORIUM_TEST = _UD_210_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu"
"UD_210 test set of COPTIC_SCRIPTORIUM."
UD_210_CROATIAN_SET_TRAIN = _UD_210_HOME + "UD_Croatian-SET/hr_set-ud-train.conllu"
"UD_210 train set of CROATIAN_SET."
UD_210_CROATIAN_SET_DEV = _UD_210_HOME + "UD_Croatian-SET/hr_set-ud-dev.conllu"
"UD_210 dev set of CROATIAN_SET."
UD_210_CROATIAN_SET_TEST = _UD_210_HOME + "UD_Croatian-SET/hr_set-ud-test.conllu"
"UD_210 test set of CROATIAN_SET."
UD_210_CZECH_CAC_TRAIN = _UD_210_HOME + "UD_Czech-CAC/cs_cac-ud-train.conllu"
"UD_210 train set of CZECH_CAC."
UD_210_CZECH_CAC_DEV = _UD_210_HOME + "UD_Czech-CAC/cs_cac-ud-dev.conllu"
"UD_210 dev set of CZECH_CAC."
UD_210_CZECH_CAC_TEST = _UD_210_HOME + "UD_Czech-CAC/cs_cac-ud-test.conllu"
"UD_210 test set of CZECH_CAC."
UD_210_CZECH_CLTT_TRAIN = _UD_210_HOME + "UD_Czech-CLTT/cs_cltt-ud-train.conllu"
"UD_210 train set of CZECH_CLTT."
UD_210_CZECH_CLTT_DEV = _UD_210_HOME + "UD_Czech-CLTT/cs_cltt-ud-dev.conllu"
"UD_210 dev set of CZECH_CLTT."
UD_210_CZECH_CLTT_TEST = _UD_210_HOME + "UD_Czech-CLTT/cs_cltt-ud-test.conllu"
"UD_210 test set of CZECH_CLTT."
UD_210_CZECH_FICTREE_TRAIN = _UD_210_HOME + "UD_Czech-FicTree/cs_fictree-ud-train.conllu"
"UD_210 train set of CZECH_FICTREE."
UD_210_CZECH_FICTREE_DEV = _UD_210_HOME + "UD_Czech-FicTree/cs_fictree-ud-dev.conllu"
"UD_210 dev set of CZECH_FICTREE."
UD_210_CZECH_FICTREE_TEST = _UD_210_HOME + "UD_Czech-FicTree/cs_fictree-ud-test.conllu"
"UD_210 test set of CZECH_FICTREE."
UD_210_CZECH_PDT_TRAIN = _UD_210_HOME + "UD_Czech-PDT/cs_pdt-ud-train.conllu"
"UD_210 train set of CZECH_PDT."
UD_210_CZECH_PDT_DEV = _UD_210_HOME + "UD_Czech-PDT/cs_pdt-ud-dev.conllu"
"UD_210 dev set of CZECH_PDT."
UD_210_CZECH_PDT_TEST = _UD_210_HOME + "UD_Czech-PDT/cs_pdt-ud-test.conllu"
"UD_210 test set of CZECH_PDT."
UD_210_CZECH_PUD_TEST = _UD_210_HOME + "UD_Czech-PUD/cs_pud-ud-test.conllu"
"UD_210 test set of CZECH_PUD."
UD_210_DANISH_DDT_TRAIN = _UD_210_HOME + "UD_Danish-DDT/da_ddt-ud-train.conllu"
"UD_210 train set of DANISH_DDT."
UD_210_DANISH_DDT_DEV = _UD_210_HOME + "UD_Danish-DDT/da_ddt-ud-dev.conllu"
"UD_210 dev set of DANISH_DDT."
UD_210_DANISH_DDT_TEST = _UD_210_HOME + "UD_Danish-DDT/da_ddt-ud-test.conllu"
"UD_210 test set of DANISH_DDT."
UD_210_DUTCH_ALPINO_TRAIN = _UD_210_HOME + "UD_Dutch-Alpino/nl_alpino-ud-train.conllu"
"UD_210 train set of DUTCH_ALPINO."
UD_210_DUTCH_ALPINO_DEV = _UD_210_HOME + "UD_Dutch-Alpino/nl_alpino-ud-dev.conllu"
"UD_210 dev set of DUTCH_ALPINO."
UD_210_DUTCH_ALPINO_TEST = _UD_210_HOME + "UD_Dutch-Alpino/nl_alpino-ud-test.conllu"
"UD_210 test set of DUTCH_ALPINO."
UD_210_DUTCH_LASSYSMALL_TRAIN = _UD_210_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu"
"UD_210 train set of DUTCH_LASSYSMALL."
UD_210_DUTCH_LASSYSMALL_DEV = _UD_210_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu"
"UD_210 dev set of DUTCH_LASSYSMALL."
UD_210_DUTCH_LASSYSMALL_TEST = _UD_210_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu"
"UD_210 test set of DUTCH_LASSYSMALL."
UD_210_ENGLISH_ATIS_TRAIN = _UD_210_HOME + "UD_English-Atis/en_atis-ud-train.conllu"
"UD_210 train set of ENGLISH_ATIS."
UD_210_ENGLISH_ATIS_DEV = _UD_210_HOME + "UD_English-Atis/en_atis-ud-dev.conllu"
"UD_210 dev set of ENGLISH_ATIS."
UD_210_ENGLISH_ATIS_TEST = _UD_210_HOME + "UD_English-Atis/en_atis-ud-test.conllu"
"UD_210 test set of ENGLISH_ATIS."
UD_210_ENGLISH_ESL_TRAIN = _UD_210_HOME + "UD_English-ESL/en_esl-ud-train.conllu"
"UD_210 train set of ENGLISH_ESL."
UD_210_ENGLISH_ESL_DEV = _UD_210_HOME + "UD_English-ESL/en_esl-ud-dev.conllu"
"UD_210 dev set of ENGLISH_ESL."
UD_210_ENGLISH_ESL_TEST = _UD_210_HOME + "UD_English-ESL/en_esl-ud-test.conllu"
"UD_210 test set of ENGLISH_ESL."
UD_210_ENGLISH_EWT_TRAIN = _UD_210_HOME + "UD_English-EWT/en_ewt-ud-train.conllu"
"UD_210 train set of ENGLISH_EWT."
UD_210_ENGLISH_EWT_DEV = _UD_210_HOME + "UD_English-EWT/en_ewt-ud-dev.conllu"
"UD_210 dev set of ENGLISH_EWT."
UD_210_ENGLISH_EWT_TEST = _UD_210_HOME + "UD_English-EWT/en_ewt-ud-test.conllu"
"UD_210 test set of ENGLISH_EWT."
UD_210_ENGLISH_GUM_TRAIN = _UD_210_HOME + "UD_English-GUM/en_gum-ud-train.conllu"
"UD_210 train set of ENGLISH_GUM."
UD_210_ENGLISH_GUM_DEV = _UD_210_HOME + "UD_English-GUM/en_gum-ud-dev.conllu"
"UD_210 dev set of ENGLISH_GUM."
UD_210_ENGLISH_GUM_TEST = _UD_210_HOME + "UD_English-GUM/en_gum-ud-test.conllu"
"UD_210 test set of ENGLISH_GUM."
UD_210_ENGLISH_GUMREDDIT_TRAIN = _UD_210_HOME + "UD_English-GUMReddit/en_gumreddit-ud-train.conllu"
"UD_210 train set of ENGLISH_GUMREDDIT."
UD_210_ENGLISH_GUMREDDIT_DEV = _UD_210_HOME + "UD_English-GUMReddit/en_gumreddit-ud-dev.conllu"
"UD_210 dev set of ENGLISH_GUMREDDIT."
UD_210_ENGLISH_GUMREDDIT_TEST = _UD_210_HOME + "UD_English-GUMReddit/en_gumreddit-ud-test.conllu"
"UD_210 test set of ENGLISH_GUMREDDIT."
UD_210_ENGLISH_LINES_TRAIN = _UD_210_HOME + "UD_English-LinES/en_lines-ud-train.conllu"
"UD_210 train set of ENGLISH_LINES."
UD_210_ENGLISH_LINES_DEV = _UD_210_HOME + "UD_English-LinES/en_lines-ud-dev.conllu"
"UD_210 dev set of ENGLISH_LINES."
UD_210_ENGLISH_LINES_TEST = _UD_210_HOME + "UD_English-LinES/en_lines-ud-test.conllu"
"UD_210 test set of ENGLISH_LINES."
UD_210_ENGLISH_PUD_TEST = _UD_210_HOME + "UD_English-PUD/en_pud-ud-test.conllu"
"UD_210 test set of ENGLISH_PUD."
UD_210_ENGLISH_PARTUT_TRAIN = _UD_210_HOME + "UD_English-ParTUT/en_partut-ud-train.conllu"
"UD_210 train set of ENGLISH_PARTUT."
UD_210_ENGLISH_PARTUT_DEV = _UD_210_HOME + "UD_English-ParTUT/en_partut-ud-dev.conllu"
"UD_210 dev set of ENGLISH_PARTUT."
UD_210_ENGLISH_PARTUT_TEST = _UD_210_HOME + "UD_English-ParTUT/en_partut-ud-test.conllu"
"UD_210 test set of ENGLISH_PARTUT."
UD_210_ENGLISH_PRONOUNS_TEST = _UD_210_HOME + "UD_English-Pronouns/en_pronouns-ud-test.conllu"
"UD_210 test set of ENGLISH_PRONOUNS."
UD_210_ERZYA_JR_TEST = _UD_210_HOME + "UD_Erzya-JR/myv_jr-ud-test.conllu"
"UD_210 test set of ERZYA_JR."
UD_210_ESTONIAN_EDT_TRAIN = _UD_210_HOME + "UD_Estonian-EDT/et_edt-ud-train.conllu"
"UD_210 train set of ESTONIAN_EDT."
UD_210_ESTONIAN_EDT_DEV = _UD_210_HOME + "UD_Estonian-EDT/et_edt-ud-dev.conllu"
"UD_210 dev set of ESTONIAN_EDT."
UD_210_ESTONIAN_EDT_TEST = _UD_210_HOME + "UD_Estonian-EDT/et_edt-ud-test.conllu"
"UD_210 test set of ESTONIAN_EDT."
UD_210_ESTONIAN_EWT_TRAIN = _UD_210_HOME + "UD_Estonian-EWT/et_ewt-ud-train.conllu"
"UD_210 train set of ESTONIAN_EWT."
UD_210_ESTONIAN_EWT_DEV = _UD_210_HOME + "UD_Estonian-EWT/et_ewt-ud-dev.conllu"
"UD_210 dev set of ESTONIAN_EWT."
UD_210_ESTONIAN_EWT_TEST = _UD_210_HOME + "UD_Estonian-EWT/et_ewt-ud-test.conllu"
"UD_210 test set of ESTONIAN_EWT."
UD_210_FAROESE_FARPAHC_TRAIN = _UD_210_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-train.conllu"
"UD_210 train set of FAROESE_FARPAHC."
UD_210_FAROESE_FARPAHC_DEV = _UD_210_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-dev.conllu"
"UD_210 dev set of FAROESE_FARPAHC."
UD_210_FAROESE_FARPAHC_TEST = _UD_210_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-test.conllu"
"UD_210 test set of FAROESE_FARPAHC."
UD_210_FAROESE_OFT_TEST = _UD_210_HOME + "UD_Faroese-OFT/fo_oft-ud-test.conllu"
"UD_210 test set of FAROESE_OFT."
UD_210_FINNISH_FTB_TRAIN = _UD_210_HOME + "UD_Finnish-FTB/fi_ftb-ud-train.conllu"
"UD_210 train set of FINNISH_FTB."
UD_210_FINNISH_FTB_DEV = _UD_210_HOME + "UD_Finnish-FTB/fi_ftb-ud-dev.conllu"
"UD_210 dev set of FINNISH_FTB."
UD_210_FINNISH_FTB_TEST = _UD_210_HOME + "UD_Finnish-FTB/fi_ftb-ud-test.conllu"
"UD_210 test set of FINNISH_FTB."
UD_210_FINNISH_OOD_TEST = _UD_210_HOME + "UD_Finnish-OOD/fi_ood-ud-test.conllu"
"UD_210 test set of FINNISH_OOD."
UD_210_FINNISH_PUD_TEST = _UD_210_HOME + "UD_Finnish-PUD/fi_pud-ud-test.conllu"
"UD_210 test set of FINNISH_PUD."
UD_210_FINNISH_TDT_TRAIN = _UD_210_HOME + "UD_Finnish-TDT/fi_tdt-ud-train.conllu"
"UD_210 train set of FINNISH_TDT."
UD_210_FINNISH_TDT_DEV = _UD_210_HOME + "UD_Finnish-TDT/fi_tdt-ud-dev.conllu"
"UD_210 dev set of FINNISH_TDT."
UD_210_FINNISH_TDT_TEST = _UD_210_HOME + "UD_Finnish-TDT/fi_tdt-ud-test.conllu"
"UD_210 test set of FINNISH_TDT."
UD_210_FRENCH_FQB_TEST = _UD_210_HOME + "UD_French-FQB/fr_fqb-ud-test.conllu"
"UD_210 test set of FRENCH_FQB."
UD_210_FRENCH_FTB_TRAIN = _UD_210_HOME + "UD_French-FTB/fr_ftb-ud-train.conllu"
"UD_210 train set of FRENCH_FTB."
UD_210_FRENCH_FTB_DEV = _UD_210_HOME + "UD_French-FTB/fr_ftb-ud-dev.conllu"
"UD_210 dev set of FRENCH_FTB."
UD_210_FRENCH_FTB_TEST = _UD_210_HOME + "UD_French-FTB/fr_ftb-ud-test.conllu"
"UD_210 test set of FRENCH_FTB."
UD_210_FRENCH_GSD_TRAIN = _UD_210_HOME + "UD_French-GSD/fr_gsd-ud-train.conllu"
"UD_210 train set of FRENCH_GSD."
UD_210_FRENCH_GSD_DEV = _UD_210_HOME + "UD_French-GSD/fr_gsd-ud-dev.conllu"
"UD_210 dev set of FRENCH_GSD."
UD_210_FRENCH_GSD_TEST = _UD_210_HOME + "UD_French-GSD/fr_gsd-ud-test.conllu"
"UD_210 test set of FRENCH_GSD."
UD_210_FRENCH_PUD_TEST = _UD_210_HOME + "UD_French-PUD/fr_pud-ud-test.conllu"
"UD_210 test set of FRENCH_PUD."
UD_210_FRENCH_PARTUT_TRAIN = _UD_210_HOME + "UD_French-ParTUT/fr_partut-ud-train.conllu"
"UD_210 train set of FRENCH_PARTUT."
UD_210_FRENCH_PARTUT_DEV = _UD_210_HOME + "UD_French-ParTUT/fr_partut-ud-dev.conllu"
"UD_210 dev set of FRENCH_PARTUT."
UD_210_FRENCH_PARTUT_TEST = _UD_210_HOME + "UD_French-ParTUT/fr_partut-ud-test.conllu"
"UD_210 test set of FRENCH_PARTUT."
UD_210_FRENCH_PARISSTORIES_TRAIN = _UD_210_HOME + "UD_French-ParisStories/fr_parisstories-ud-train.conllu"
"UD_210 train set of FRENCH_PARISSTORIES."
UD_210_FRENCH_PARISSTORIES_TEST = _UD_210_HOME + "UD_French-ParisStories/fr_parisstories-ud-test.conllu"
"UD_210 test set of FRENCH_PARISSTORIES."
UD_210_FRENCH_RHAPSODIE_TRAIN = _UD_210_HOME + "UD_French-Rhapsodie/fr_rhapsodie-ud-train.conllu"
"UD_210 train set of FRENCH_RHAPSODIE."
UD_210_FRENCH_RHAPSODIE_DEV = _UD_210_HOME + "UD_French-Rhapsodie/fr_rhapsodie-ud-dev.conllu"
"UD_210 dev set of FRENCH_RHAPSODIE."
UD_210_FRENCH_RHAPSODIE_TEST = _UD_210_HOME + "UD_French-Rhapsodie/fr_rhapsodie-ud-test.conllu"
"UD_210 test set of FRENCH_RHAPSODIE."
UD_210_FRENCH_SEQUOIA_TRAIN = _UD_210_HOME + "UD_French-Sequoia/fr_sequoia-ud-train.conllu"
"UD_210 train set of FRENCH_SEQUOIA."
UD_210_FRENCH_SEQUOIA_DEV = _UD_210_HOME + "UD_French-Sequoia/fr_sequoia-ud-dev.conllu"
"UD_210 dev set of FRENCH_SEQUOIA."
UD_210_FRENCH_SEQUOIA_TEST = _UD_210_HOME + "UD_French-Sequoia/fr_sequoia-ud-test.conllu"
"UD_210 test set of FRENCH_SEQUOIA."
UD_210_FRISIAN_DUTCH_FAME_TEST = _UD_210_HOME + "UD_Frisian_Dutch-Fame/qfn_fame-ud-test.conllu"
"UD_210 test set of FRISIAN_DUTCH_FAME."
# Galician: CTG has train/dev/test; TreeGal has train/test only (no DEV constant).
UD_210_GALICIAN_CTG_TRAIN = _UD_210_HOME + "UD_Galician-CTG/gl_ctg-ud-train.conllu"
"UD_210 train set of GALICIAN_CTG."
UD_210_GALICIAN_CTG_DEV = _UD_210_HOME + "UD_Galician-CTG/gl_ctg-ud-dev.conllu"
"UD_210 dev set of GALICIAN_CTG."
UD_210_GALICIAN_CTG_TEST = _UD_210_HOME + "UD_Galician-CTG/gl_ctg-ud-test.conllu"
"UD_210 test set of GALICIAN_CTG."
UD_210_GALICIAN_TREEGAL_TRAIN = _UD_210_HOME + "UD_Galician-TreeGal/gl_treegal-ud-train.conllu"
"UD_210 train set of GALICIAN_TREEGAL."
UD_210_GALICIAN_TREEGAL_TEST = _UD_210_HOME + "UD_Galician-TreeGal/gl_treegal-ud-test.conllu"
"UD_210 test set of GALICIAN_TREEGAL."
# German: GSD and HDT have train/dev/test; LIT and PUD define only a TEST constant.
UD_210_GERMAN_GSD_TRAIN = _UD_210_HOME + "UD_German-GSD/de_gsd-ud-train.conllu"
"UD_210 train set of GERMAN_GSD."
UD_210_GERMAN_GSD_DEV = _UD_210_HOME + "UD_German-GSD/de_gsd-ud-dev.conllu"
"UD_210 dev set of GERMAN_GSD."
UD_210_GERMAN_GSD_TEST = _UD_210_HOME + "UD_German-GSD/de_gsd-ud-test.conllu"
"UD_210 test set of GERMAN_GSD."
UD_210_GERMAN_HDT_TRAIN = _UD_210_HOME + "UD_German-HDT/de_hdt-ud-train.conllu"
"UD_210 train set of GERMAN_HDT."
UD_210_GERMAN_HDT_DEV = _UD_210_HOME + "UD_German-HDT/de_hdt-ud-dev.conllu"
"UD_210 dev set of GERMAN_HDT."
UD_210_GERMAN_HDT_TEST = _UD_210_HOME + "UD_German-HDT/de_hdt-ud-test.conllu"
"UD_210 test set of GERMAN_HDT."
UD_210_GERMAN_LIT_TEST = _UD_210_HOME + "UD_German-LIT/de_lit-ud-test.conllu"
"UD_210 test set of GERMAN_LIT."
UD_210_GERMAN_PUD_TEST = _UD_210_HOME + "UD_German-PUD/de_pud-ud-test.conllu"
"UD_210 test set of GERMAN_PUD."
# Gothic (PROIEL) and Greek (GDT): train/dev/test each.
UD_210_GOTHIC_PROIEL_TRAIN = _UD_210_HOME + "UD_Gothic-PROIEL/got_proiel-ud-train.conllu"
"UD_210 train set of GOTHIC_PROIEL."
UD_210_GOTHIC_PROIEL_DEV = _UD_210_HOME + "UD_Gothic-PROIEL/got_proiel-ud-dev.conllu"
"UD_210 dev set of GOTHIC_PROIEL."
UD_210_GOTHIC_PROIEL_TEST = _UD_210_HOME + "UD_Gothic-PROIEL/got_proiel-ud-test.conllu"
"UD_210 test set of GOTHIC_PROIEL."
UD_210_GREEK_GDT_TRAIN = _UD_210_HOME + "UD_Greek-GDT/el_gdt-ud-train.conllu"
"UD_210 train set of GREEK_GDT."
UD_210_GREEK_GDT_DEV = _UD_210_HOME + "UD_Greek-GDT/el_gdt-ud-dev.conllu"
"UD_210 dev set of GREEK_GDT."
UD_210_GREEK_GDT_TEST = _UD_210_HOME + "UD_Greek-GDT/el_gdt-ud-test.conllu"
"UD_210 test set of GREEK_GDT."
# Guajajara (TuDeT) and Guarani (OldTuDeT): only TEST constants are defined.
UD_210_GUAJAJARA_TUDET_TEST = _UD_210_HOME + "UD_Guajajara-TuDeT/gub_tudet-ud-test.conllu"
"UD_210 test set of GUAJAJARA_TUDET."
UD_210_GUARANI_OLDTUDET_TEST = _UD_210_HOME + "UD_Guarani-OldTuDeT/gn_oldtudet-ud-test.conllu"
"UD_210 test set of GUARANI_OLDTUDET."
# Hebrew: HTB and IAHLTwiki, train/dev/test each.
UD_210_HEBREW_HTB_TRAIN = _UD_210_HOME + "UD_Hebrew-HTB/he_htb-ud-train.conllu"
"UD_210 train set of HEBREW_HTB."
UD_210_HEBREW_HTB_DEV = _UD_210_HOME + "UD_Hebrew-HTB/he_htb-ud-dev.conllu"
"UD_210 dev set of HEBREW_HTB."
UD_210_HEBREW_HTB_TEST = _UD_210_HOME + "UD_Hebrew-HTB/he_htb-ud-test.conllu"
"UD_210 test set of HEBREW_HTB."
UD_210_HEBREW_IAHLTWIKI_TRAIN = _UD_210_HOME + "UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-train.conllu"
"UD_210 train set of HEBREW_IAHLTWIKI."
UD_210_HEBREW_IAHLTWIKI_DEV = _UD_210_HOME + "UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-dev.conllu"
"UD_210 dev set of HEBREW_IAHLTWIKI."
UD_210_HEBREW_IAHLTWIKI_TEST = _UD_210_HOME + "UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-test.conllu"
"UD_210 test set of HEBREW_IAHLTWIKI."
# Hindi: HDTB has train/dev/test; PUD defines only a TEST constant.
UD_210_HINDI_HDTB_TRAIN = _UD_210_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-train.conllu"
"UD_210 train set of HINDI_HDTB."
UD_210_HINDI_HDTB_DEV = _UD_210_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu"
"UD_210 dev set of HINDI_HDTB."
UD_210_HINDI_HDTB_TEST = _UD_210_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-test.conllu"
"UD_210 test set of HINDI_HDTB."
UD_210_HINDI_PUD_TEST = _UD_210_HOME + "UD_Hindi-PUD/hi_pud-ud-test.conllu"
"UD_210 test set of HINDI_PUD."
# UD_Hindi_English-HIENCS: train/dev/test.
UD_210_HINDI_ENGLISH_HIENCS_TRAIN = _UD_210_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-train.conllu"
"UD_210 train set of HINDI_ENGLISH_HIENCS."
UD_210_HINDI_ENGLISH_HIENCS_DEV = _UD_210_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-dev.conllu"
"UD_210 dev set of HINDI_ENGLISH_HIENCS."
UD_210_HINDI_ENGLISH_HIENCS_TEST = _UD_210_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-test.conllu"
"UD_210 test set of HINDI_ENGLISH_HIENCS."
# UD_Hittite-HitTB: only a TEST constant is defined.
UD_210_HITTITE_HITTB_TEST = _UD_210_HOME + "UD_Hittite-HitTB/hit_hittb-ud-test.conllu"
"UD_210 test set of HITTITE_HITTB."
# UD_Hungarian-Szeged: train/dev/test.
UD_210_HUNGARIAN_SZEGED_TRAIN = _UD_210_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-train.conllu"
"UD_210 train set of HUNGARIAN_SZEGED."
UD_210_HUNGARIAN_SZEGED_DEV = _UD_210_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu"
"UD_210 dev set of HUNGARIAN_SZEGED."
UD_210_HUNGARIAN_SZEGED_TEST = _UD_210_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-test.conllu"
"UD_210 test set of HUNGARIAN_SZEGED."
# Icelandic: IcePaHC and Modern have train/dev/test; PUD defines only a TEST constant.
UD_210_ICELANDIC_ICEPAHC_TRAIN = _UD_210_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-train.conllu"
"UD_210 train set of ICELANDIC_ICEPAHC."
UD_210_ICELANDIC_ICEPAHC_DEV = _UD_210_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-dev.conllu"
"UD_210 dev set of ICELANDIC_ICEPAHC."
UD_210_ICELANDIC_ICEPAHC_TEST = _UD_210_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-test.conllu"
"UD_210 test set of ICELANDIC_ICEPAHC."
UD_210_ICELANDIC_MODERN_TRAIN = _UD_210_HOME + "UD_Icelandic-Modern/is_modern-ud-train.conllu"
"UD_210 train set of ICELANDIC_MODERN."
UD_210_ICELANDIC_MODERN_DEV = _UD_210_HOME + "UD_Icelandic-Modern/is_modern-ud-dev.conllu"
"UD_210 dev set of ICELANDIC_MODERN."
UD_210_ICELANDIC_MODERN_TEST = _UD_210_HOME + "UD_Icelandic-Modern/is_modern-ud-test.conllu"
"UD_210 test set of ICELANDIC_MODERN."
UD_210_ICELANDIC_PUD_TEST = _UD_210_HOME + "UD_Icelandic-PUD/is_pud-ud-test.conllu"
"UD_210 test set of ICELANDIC_PUD."
# Indonesian: CSUI has train/test only (no DEV constant); GSD has train/dev/test; PUD is TEST only.
UD_210_INDONESIAN_CSUI_TRAIN = _UD_210_HOME + "UD_Indonesian-CSUI/id_csui-ud-train.conllu"
"UD_210 train set of INDONESIAN_CSUI."
UD_210_INDONESIAN_CSUI_TEST = _UD_210_HOME + "UD_Indonesian-CSUI/id_csui-ud-test.conllu"
"UD_210 test set of INDONESIAN_CSUI."
UD_210_INDONESIAN_GSD_TRAIN = _UD_210_HOME + "UD_Indonesian-GSD/id_gsd-ud-train.conllu"
"UD_210 train set of INDONESIAN_GSD."
UD_210_INDONESIAN_GSD_DEV = _UD_210_HOME + "UD_Indonesian-GSD/id_gsd-ud-dev.conllu"
"UD_210 dev set of INDONESIAN_GSD."
UD_210_INDONESIAN_GSD_TEST = _UD_210_HOME + "UD_Indonesian-GSD/id_gsd-ud-test.conllu"
"UD_210 test set of INDONESIAN_GSD."
UD_210_INDONESIAN_PUD_TEST = _UD_210_HOME + "UD_Indonesian-PUD/id_pud-ud-test.conllu"
"UD_210 test set of INDONESIAN_PUD."
# Irish: IDT has train/dev/test; TwittIrish defines only a TEST constant.
UD_210_IRISH_IDT_TRAIN = _UD_210_HOME + "UD_Irish-IDT/ga_idt-ud-train.conllu"
"UD_210 train set of IRISH_IDT."
UD_210_IRISH_IDT_DEV = _UD_210_HOME + "UD_Irish-IDT/ga_idt-ud-dev.conllu"
"UD_210 dev set of IRISH_IDT."
UD_210_IRISH_IDT_TEST = _UD_210_HOME + "UD_Irish-IDT/ga_idt-ud-test.conllu"
"UD_210 test set of IRISH_IDT."
UD_210_IRISH_TWITTIRISH_TEST = _UD_210_HOME + "UD_Irish-TwittIrish/ga_twittirish-ud-test.conllu"
"UD_210 test set of IRISH_TWITTIRISH."
# Italian: ISDT, MarkIT, ParTUT, PoSTWITA, TWITTIRO and VIT have train/dev/test;
# PUD and Valico define only a TEST constant.
UD_210_ITALIAN_ISDT_TRAIN = _UD_210_HOME + "UD_Italian-ISDT/it_isdt-ud-train.conllu"
"UD_210 train set of ITALIAN_ISDT."
UD_210_ITALIAN_ISDT_DEV = _UD_210_HOME + "UD_Italian-ISDT/it_isdt-ud-dev.conllu"
"UD_210 dev set of ITALIAN_ISDT."
UD_210_ITALIAN_ISDT_TEST = _UD_210_HOME + "UD_Italian-ISDT/it_isdt-ud-test.conllu"
"UD_210 test set of ITALIAN_ISDT."
UD_210_ITALIAN_MARKIT_TRAIN = _UD_210_HOME + "UD_Italian-MarkIT/it_markit-ud-train.conllu"
"UD_210 train set of ITALIAN_MARKIT."
UD_210_ITALIAN_MARKIT_DEV = _UD_210_HOME + "UD_Italian-MarkIT/it_markit-ud-dev.conllu"
"UD_210 dev set of ITALIAN_MARKIT."
UD_210_ITALIAN_MARKIT_TEST = _UD_210_HOME + "UD_Italian-MarkIT/it_markit-ud-test.conllu"
"UD_210 test set of ITALIAN_MARKIT."
UD_210_ITALIAN_PUD_TEST = _UD_210_HOME + "UD_Italian-PUD/it_pud-ud-test.conllu"
"UD_210 test set of ITALIAN_PUD."
UD_210_ITALIAN_PARTUT_TRAIN = _UD_210_HOME + "UD_Italian-ParTUT/it_partut-ud-train.conllu"
"UD_210 train set of ITALIAN_PARTUT."
UD_210_ITALIAN_PARTUT_DEV = _UD_210_HOME + "UD_Italian-ParTUT/it_partut-ud-dev.conllu"
"UD_210 dev set of ITALIAN_PARTUT."
UD_210_ITALIAN_PARTUT_TEST = _UD_210_HOME + "UD_Italian-ParTUT/it_partut-ud-test.conllu"
"UD_210 test set of ITALIAN_PARTUT."
UD_210_ITALIAN_POSTWITA_TRAIN = _UD_210_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-train.conllu"
"UD_210 train set of ITALIAN_POSTWITA."
UD_210_ITALIAN_POSTWITA_DEV = _UD_210_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu"
"UD_210 dev set of ITALIAN_POSTWITA."
UD_210_ITALIAN_POSTWITA_TEST = _UD_210_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-test.conllu"
"UD_210 test set of ITALIAN_POSTWITA."
UD_210_ITALIAN_TWITTIRO_TRAIN = _UD_210_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu"
"UD_210 train set of ITALIAN_TWITTIRO."
UD_210_ITALIAN_TWITTIRO_DEV = _UD_210_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu"
"UD_210 dev set of ITALIAN_TWITTIRO."
UD_210_ITALIAN_TWITTIRO_TEST = _UD_210_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-test.conllu"
"UD_210 test set of ITALIAN_TWITTIRO."
UD_210_ITALIAN_VIT_TRAIN = _UD_210_HOME + "UD_Italian-VIT/it_vit-ud-train.conllu"
"UD_210 train set of ITALIAN_VIT."
UD_210_ITALIAN_VIT_DEV = _UD_210_HOME + "UD_Italian-VIT/it_vit-ud-dev.conllu"
"UD_210 dev set of ITALIAN_VIT."
UD_210_ITALIAN_VIT_TEST = _UD_210_HOME + "UD_Italian-VIT/it_vit-ud-test.conllu"
"UD_210 test set of ITALIAN_VIT."
UD_210_ITALIAN_VALICO_TEST = _UD_210_HOME + "UD_Italian-Valico/it_valico-ud-test.conllu"
"UD_210 test set of ITALIAN_VALICO."
# Japanese: BCCWJ, BCCWJLUW, GSD and GSDLUW have train/dev/test;
# Modern, PUD and PUDLUW define only a TEST constant.
UD_210_JAPANESE_BCCWJ_TRAIN = _UD_210_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu"
"UD_210 train set of JAPANESE_BCCWJ."
UD_210_JAPANESE_BCCWJ_DEV = _UD_210_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu"
"UD_210 dev set of JAPANESE_BCCWJ."
UD_210_JAPANESE_BCCWJ_TEST = _UD_210_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu"
"UD_210 test set of JAPANESE_BCCWJ."
UD_210_JAPANESE_BCCWJLUW_TRAIN = _UD_210_HOME + "UD_Japanese-BCCWJLUW/ja_bccwjluw-ud-train.conllu"
"UD_210 train set of JAPANESE_BCCWJLUW."
UD_210_JAPANESE_BCCWJLUW_DEV = _UD_210_HOME + "UD_Japanese-BCCWJLUW/ja_bccwjluw-ud-dev.conllu"
"UD_210 dev set of JAPANESE_BCCWJLUW."
UD_210_JAPANESE_BCCWJLUW_TEST = _UD_210_HOME + "UD_Japanese-BCCWJLUW/ja_bccwjluw-ud-test.conllu"
"UD_210 test set of JAPANESE_BCCWJLUW."
UD_210_JAPANESE_GSD_TRAIN = _UD_210_HOME + "UD_Japanese-GSD/ja_gsd-ud-train.conllu"
"UD_210 train set of JAPANESE_GSD."
UD_210_JAPANESE_GSD_DEV = _UD_210_HOME + "UD_Japanese-GSD/ja_gsd-ud-dev.conllu"
"UD_210 dev set of JAPANESE_GSD."
UD_210_JAPANESE_GSD_TEST = _UD_210_HOME + "UD_Japanese-GSD/ja_gsd-ud-test.conllu"
"UD_210 test set of JAPANESE_GSD."
UD_210_JAPANESE_GSDLUW_TRAIN = _UD_210_HOME + "UD_Japanese-GSDLUW/ja_gsdluw-ud-train.conllu"
"UD_210 train set of JAPANESE_GSDLUW."
UD_210_JAPANESE_GSDLUW_DEV = _UD_210_HOME + "UD_Japanese-GSDLUW/ja_gsdluw-ud-dev.conllu"
"UD_210 dev set of JAPANESE_GSDLUW."
UD_210_JAPANESE_GSDLUW_TEST = _UD_210_HOME + "UD_Japanese-GSDLUW/ja_gsdluw-ud-test.conllu"
"UD_210 test set of JAPANESE_GSDLUW."
UD_210_JAPANESE_MODERN_TEST = _UD_210_HOME + "UD_Japanese-Modern/ja_modern-ud-test.conllu"
"UD_210 test set of JAPANESE_MODERN."
UD_210_JAPANESE_PUD_TEST = _UD_210_HOME + "UD_Japanese-PUD/ja_pud-ud-test.conllu"
"UD_210 test set of JAPANESE_PUD."
UD_210_JAPANESE_PUDLUW_TEST = _UD_210_HOME + "UD_Japanese-PUDLUW/ja_pudluw-ud-test.conllu"
"UD_210 test set of JAPANESE_PUDLUW."
# UD_Javanese-CSUI: only a TEST constant is defined.
UD_210_JAVANESE_CSUI_TEST = _UD_210_HOME + "UD_Javanese-CSUI/jv_csui-ud-test.conllu"
"UD_210 test set of JAVANESE_CSUI."
# Kaapor, Kangri, Karelian and Karo: only TEST constants are defined.
UD_210_KAAPOR_TUDET_TEST = _UD_210_HOME + "UD_Kaapor-TuDeT/urb_tudet-ud-test.conllu"
"UD_210 test set of KAAPOR_TUDET."
UD_210_KANGRI_KDTB_TEST = _UD_210_HOME + "UD_Kangri-KDTB/xnr_kdtb-ud-test.conllu"
"UD_210 test set of KANGRI_KDTB."
UD_210_KARELIAN_KKPP_TEST = _UD_210_HOME + "UD_Karelian-KKPP/krl_kkpp-ud-test.conllu"
"UD_210 test set of KARELIAN_KKPP."
UD_210_KARO_TUDET_TEST = _UD_210_HOME + "UD_Karo-TuDeT/arr_tudet-ud-test.conllu"
"UD_210 test set of KARO_TUDET."
# UD_Kazakh-KTB: train/test only (no DEV constant).
UD_210_KAZAKH_KTB_TRAIN = _UD_210_HOME + "UD_Kazakh-KTB/kk_ktb-ud-train.conllu"
"UD_210 train set of KAZAKH_KTB."
UD_210_KAZAKH_KTB_TEST = _UD_210_HOME + "UD_Kazakh-KTB/kk_ktb-ud-test.conllu"
"UD_210 test set of KAZAKH_KTB."
# Khunsari, Kiche, Komi_Permyak and Komi_Zyrian: only TEST constants are defined.
UD_210_KHUNSARI_AHA_TEST = _UD_210_HOME + "UD_Khunsari-AHA/kfm_aha-ud-test.conllu"
"UD_210 test set of KHUNSARI_AHA."
UD_210_KICHE_IU_TEST = _UD_210_HOME + "UD_Kiche-IU/quc_iu-ud-test.conllu"
"UD_210 test set of KICHE_IU."
UD_210_KOMI_PERMYAK_UH_TEST = _UD_210_HOME + "UD_Komi_Permyak-UH/koi_uh-ud-test.conllu"
"UD_210 test set of KOMI_PERMYAK_UH."
UD_210_KOMI_ZYRIAN_IKDP_TEST = _UD_210_HOME + "UD_Komi_Zyrian-IKDP/kpv_ikdp-ud-test.conllu"
"UD_210 test set of KOMI_ZYRIAN_IKDP."
UD_210_KOMI_ZYRIAN_LATTICE_TEST = _UD_210_HOME + "UD_Komi_Zyrian-Lattice/kpv_lattice-ud-test.conllu"
"UD_210 test set of KOMI_ZYRIAN_LATTICE."
# Korean: GSD and Kaist have train/dev/test; PUD defines only a TEST constant.
UD_210_KOREAN_GSD_TRAIN = _UD_210_HOME + "UD_Korean-GSD/ko_gsd-ud-train.conllu"
"UD_210 train set of KOREAN_GSD."
UD_210_KOREAN_GSD_DEV = _UD_210_HOME + "UD_Korean-GSD/ko_gsd-ud-dev.conllu"
"UD_210 dev set of KOREAN_GSD."
UD_210_KOREAN_GSD_TEST = _UD_210_HOME + "UD_Korean-GSD/ko_gsd-ud-test.conllu"
"UD_210 test set of KOREAN_GSD."
UD_210_KOREAN_KAIST_TRAIN = _UD_210_HOME + "UD_Korean-Kaist/ko_kaist-ud-train.conllu"
"UD_210 train set of KOREAN_KAIST."
UD_210_KOREAN_KAIST_DEV = _UD_210_HOME + "UD_Korean-Kaist/ko_kaist-ud-dev.conllu"
"UD_210 dev set of KOREAN_KAIST."
UD_210_KOREAN_KAIST_TEST = _UD_210_HOME + "UD_Korean-Kaist/ko_kaist-ud-test.conllu"
"UD_210 test set of KOREAN_KAIST."
UD_210_KOREAN_PUD_TEST = _UD_210_HOME + "UD_Korean-PUD/ko_pud-ud-test.conllu"
"UD_210 test set of KOREAN_PUD."
# UD_Kurmanji-MG: train/test only (no DEV constant).
UD_210_KURMANJI_MG_TRAIN = _UD_210_HOME + "UD_Kurmanji-MG/kmr_mg-ud-train.conllu"
"UD_210 train set of KURMANJI_MG."
UD_210_KURMANJI_MG_TEST = _UD_210_HOME + "UD_Kurmanji-MG/kmr_mg-ud-test.conllu"
"UD_210 test set of KURMANJI_MG."
# Latin: ITTB, LLCT, PROIEL and UDante have train/dev/test;
# Perseus has train/test only (no DEV constant).
UD_210_LATIN_ITTB_TRAIN = _UD_210_HOME + "UD_Latin-ITTB/la_ittb-ud-train.conllu"
"UD_210 train set of LATIN_ITTB."
UD_210_LATIN_ITTB_DEV = _UD_210_HOME + "UD_Latin-ITTB/la_ittb-ud-dev.conllu"
"UD_210 dev set of LATIN_ITTB."
UD_210_LATIN_ITTB_TEST = _UD_210_HOME + "UD_Latin-ITTB/la_ittb-ud-test.conllu"
"UD_210 test set of LATIN_ITTB."
UD_210_LATIN_LLCT_TRAIN = _UD_210_HOME + "UD_Latin-LLCT/la_llct-ud-train.conllu"
"UD_210 train set of LATIN_LLCT."
UD_210_LATIN_LLCT_DEV = _UD_210_HOME + "UD_Latin-LLCT/la_llct-ud-dev.conllu"
"UD_210 dev set of LATIN_LLCT."
UD_210_LATIN_LLCT_TEST = _UD_210_HOME + "UD_Latin-LLCT/la_llct-ud-test.conllu"
"UD_210 test set of LATIN_LLCT."
UD_210_LATIN_PROIEL_TRAIN = _UD_210_HOME + "UD_Latin-PROIEL/la_proiel-ud-train.conllu"
"UD_210 train set of LATIN_PROIEL."
UD_210_LATIN_PROIEL_DEV = _UD_210_HOME + "UD_Latin-PROIEL/la_proiel-ud-dev.conllu"
"UD_210 dev set of LATIN_PROIEL."
UD_210_LATIN_PROIEL_TEST = _UD_210_HOME + "UD_Latin-PROIEL/la_proiel-ud-test.conllu"
"UD_210 test set of LATIN_PROIEL."
UD_210_LATIN_PERSEUS_TRAIN = _UD_210_HOME + "UD_Latin-Perseus/la_perseus-ud-train.conllu"
"UD_210 train set of LATIN_PERSEUS."
UD_210_LATIN_PERSEUS_TEST = _UD_210_HOME + "UD_Latin-Perseus/la_perseus-ud-test.conllu"
"UD_210 test set of LATIN_PERSEUS."
UD_210_LATIN_UDANTE_TRAIN = _UD_210_HOME + "UD_Latin-UDante/la_udante-ud-train.conllu"
"UD_210 train set of LATIN_UDANTE."
UD_210_LATIN_UDANTE_DEV = _UD_210_HOME + "UD_Latin-UDante/la_udante-ud-dev.conllu"
"UD_210 dev set of LATIN_UDANTE."
UD_210_LATIN_UDANTE_TEST = _UD_210_HOME + "UD_Latin-UDante/la_udante-ud-test.conllu"
"UD_210 test set of LATIN_UDANTE."
# UD_Latvian-LVTB: train/dev/test.
UD_210_LATVIAN_LVTB_TRAIN = _UD_210_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-train.conllu"
"UD_210 train set of LATVIAN_LVTB."
UD_210_LATVIAN_LVTB_DEV = _UD_210_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu"
"UD_210 dev set of LATVIAN_LVTB."
UD_210_LATVIAN_LVTB_TEST = _UD_210_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-test.conllu"
"UD_210 test set of LATVIAN_LVTB."
# UD_Ligurian-GLT: train/test only (no DEV constant).
UD_210_LIGURIAN_GLT_TRAIN = _UD_210_HOME + "UD_Ligurian-GLT/lij_glt-ud-train.conllu"
"UD_210 train set of LIGURIAN_GLT."
UD_210_LIGURIAN_GLT_TEST = _UD_210_HOME + "UD_Ligurian-GLT/lij_glt-ud-test.conllu"
"UD_210 test set of LIGURIAN_GLT."
# Lithuanian: ALKSNIS and HSE, train/dev/test each.
UD_210_LITHUANIAN_ALKSNIS_TRAIN = _UD_210_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu"
"UD_210 train set of LITHUANIAN_ALKSNIS."
UD_210_LITHUANIAN_ALKSNIS_DEV = _UD_210_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu"
"UD_210 dev set of LITHUANIAN_ALKSNIS."
UD_210_LITHUANIAN_ALKSNIS_TEST = _UD_210_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-test.conllu"
"UD_210 test set of LITHUANIAN_ALKSNIS."
UD_210_LITHUANIAN_HSE_TRAIN = _UD_210_HOME + "UD_Lithuanian-HSE/lt_hse-ud-train.conllu"
"UD_210 train set of LITHUANIAN_HSE."
UD_210_LITHUANIAN_HSE_DEV = _UD_210_HOME + "UD_Lithuanian-HSE/lt_hse-ud-dev.conllu"
"UD_210 dev set of LITHUANIAN_HSE."
UD_210_LITHUANIAN_HSE_TEST = _UD_210_HOME + "UD_Lithuanian-HSE/lt_hse-ud-test.conllu"
"UD_210 test set of LITHUANIAN_HSE."
# UD_Livvi-KKPP: train/test only (no DEV constant).
UD_210_LIVVI_KKPP_TRAIN = _UD_210_HOME + "UD_Livvi-KKPP/olo_kkpp-ud-train.conllu"
"UD_210 train set of LIVVI_KKPP."
UD_210_LIVVI_KKPP_TEST = _UD_210_HOME + "UD_Livvi-KKPP/olo_kkpp-ud-test.conllu"
"UD_210 test set of LIVVI_KKPP."
# UD_Low_Saxon-LSDC: only a TEST constant is defined.
UD_210_LOW_SAXON_LSDC_TEST = _UD_210_HOME + "UD_Low_Saxon-LSDC/nds_lsdc-ud-test.conllu"
"UD_210 test set of LOW_SAXON_LSDC."
# Madi (Jarawara) and Makurap (TuDeT): only TEST constants are defined.
UD_210_MADI_JARAWARA_TEST = _UD_210_HOME + "UD_Madi-Jarawara/jaa_jarawara-ud-test.conllu"
"UD_210 test set of MADI_JARAWARA."
UD_210_MAKURAP_TUDET_TEST = _UD_210_HOME + "UD_Makurap-TuDeT/mpu_tudet-ud-test.conllu"
"UD_210 test set of MAKURAP_TUDET."
# UD_Maltese-MUDT: train/dev/test.
UD_210_MALTESE_MUDT_TRAIN = _UD_210_HOME + "UD_Maltese-MUDT/mt_mudt-ud-train.conllu"
"UD_210 train set of MALTESE_MUDT."
UD_210_MALTESE_MUDT_DEV = _UD_210_HOME + "UD_Maltese-MUDT/mt_mudt-ud-dev.conllu"
"UD_210 dev set of MALTESE_MUDT."
UD_210_MALTESE_MUDT_TEST = _UD_210_HOME + "UD_Maltese-MUDT/mt_mudt-ud-test.conllu"
"UD_210 test set of MALTESE_MUDT."
# UD_Manx-Cadhan: only a TEST constant is defined.
UD_210_MANX_CADHAN_TEST = _UD_210_HOME + "UD_Manx-Cadhan/gv_cadhan-ud-test.conllu"
"UD_210 test set of MANX_CADHAN."
# UD_Marathi-UFAL: train/dev/test.
UD_210_MARATHI_UFAL_TRAIN = _UD_210_HOME + "UD_Marathi-UFAL/mr_ufal-ud-train.conllu"
"UD_210 train set of MARATHI_UFAL."
UD_210_MARATHI_UFAL_DEV = _UD_210_HOME + "UD_Marathi-UFAL/mr_ufal-ud-dev.conllu"
"UD_210 dev set of MARATHI_UFAL."
UD_210_MARATHI_UFAL_TEST = _UD_210_HOME + "UD_Marathi-UFAL/mr_ufal-ud-test.conllu"
"UD_210 test set of MARATHI_UFAL."
# Mbya_Guarani (Dooley, Thomas), Moksha (JR) and Munduruku (TuDeT): only TEST constants.
UD_210_MBYA_GUARANI_DOOLEY_TEST = _UD_210_HOME + "UD_Mbya_Guarani-Dooley/gun_dooley-ud-test.conllu"
"UD_210 test set of MBYA_GUARANI_DOOLEY."
UD_210_MBYA_GUARANI_THOMAS_TEST = _UD_210_HOME + "UD_Mbya_Guarani-Thomas/gun_thomas-ud-test.conllu"
"UD_210 test set of MBYA_GUARANI_THOMAS."
UD_210_MOKSHA_JR_TEST = _UD_210_HOME + "UD_Moksha-JR/mdf_jr-ud-test.conllu"
"UD_210 test set of MOKSHA_JR."
UD_210_MUNDURUKU_TUDET_TEST = _UD_210_HOME + "UD_Munduruku-TuDeT/myu_tudet-ud-test.conllu"
"UD_210 test set of MUNDURUKU_TUDET."
# UD_Naija-NSC: train/dev/test.
UD_210_NAIJA_NSC_TRAIN = _UD_210_HOME + "UD_Naija-NSC/pcm_nsc-ud-train.conllu"
"UD_210 train set of NAIJA_NSC."
UD_210_NAIJA_NSC_DEV = _UD_210_HOME + "UD_Naija-NSC/pcm_nsc-ud-dev.conllu"
"UD_210 dev set of NAIJA_NSC."
UD_210_NAIJA_NSC_TEST = _UD_210_HOME + "UD_Naija-NSC/pcm_nsc-ud-test.conllu"
"UD_210 test set of NAIJA_NSC."
# Nayini (AHA) and Neapolitan (RB): only TEST constants are defined.
UD_210_NAYINI_AHA_TEST = _UD_210_HOME + "UD_Nayini-AHA/nyq_aha-ud-test.conllu"
"UD_210 test set of NAYINI_AHA."
UD_210_NEAPOLITAN_RB_TEST = _UD_210_HOME + "UD_Neapolitan-RB/nap_rb-ud-test.conllu"
"UD_210 test set of NEAPOLITAN_RB."
# UD_North_Sami-Giella: train/test only (no DEV constant).
UD_210_NORTH_SAMI_GIELLA_TRAIN = _UD_210_HOME + "UD_North_Sami-Giella/sme_giella-ud-train.conllu"
"UD_210 train set of NORTH_SAMI_GIELLA."
UD_210_NORTH_SAMI_GIELLA_TEST = _UD_210_HOME + "UD_North_Sami-Giella/sme_giella-ud-test.conllu"
"UD_210 test set of NORTH_SAMI_GIELLA."
# Norwegian: Bokmaal, Nynorsk and NynorskLIA, train/dev/test each.
UD_210_NORWEGIAN_BOKMAAL_TRAIN = _UD_210_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu"
"UD_210 train set of NORWEGIAN_BOKMAAL."
UD_210_NORWEGIAN_BOKMAAL_DEV = _UD_210_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu"
"UD_210 dev set of NORWEGIAN_BOKMAAL."
UD_210_NORWEGIAN_BOKMAAL_TEST = _UD_210_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu"
"UD_210 test set of NORWEGIAN_BOKMAAL."
UD_210_NORWEGIAN_NYNORSK_TRAIN = _UD_210_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu"
"UD_210 train set of NORWEGIAN_NYNORSK."
UD_210_NORWEGIAN_NYNORSK_DEV = _UD_210_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu"
"UD_210 dev set of NORWEGIAN_NYNORSK."
UD_210_NORWEGIAN_NYNORSK_TEST = _UD_210_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu"
"UD_210 test set of NORWEGIAN_NYNORSK."
UD_210_NORWEGIAN_NYNORSKLIA_TRAIN = _UD_210_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu"
"UD_210 train set of NORWEGIAN_NYNORSKLIA."
UD_210_NORWEGIAN_NYNORSKLIA_DEV = _UD_210_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu"
"UD_210 dev set of NORWEGIAN_NYNORSKLIA."
UD_210_NORWEGIAN_NYNORSKLIA_TEST = _UD_210_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu"
"UD_210 test set of NORWEGIAN_NYNORSKLIA."
# UD_Old_Church_Slavonic-PROIEL: train/dev/test.
UD_210_OLD_CHURCH_SLAVONIC_PROIEL_TRAIN = _UD_210_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu"
"UD_210 train set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_210_OLD_CHURCH_SLAVONIC_PROIEL_DEV = _UD_210_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu"
"UD_210 dev set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_210_OLD_CHURCH_SLAVONIC_PROIEL_TEST = _UD_210_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu"
"UD_210 test set of OLD_CHURCH_SLAVONIC_PROIEL."
# Old_East_Slavic: Birchbark and TOROT have train/dev/test;
# RNC has train/test only (no DEV constant).
UD_210_OLD_EAST_SLAVIC_BIRCHBARK_TRAIN = _UD_210_HOME + "UD_Old_East_Slavic-Birchbark/orv_birchbark-ud-train.conllu"
"UD_210 train set of OLD_EAST_SLAVIC_BIRCHBARK."
UD_210_OLD_EAST_SLAVIC_BIRCHBARK_DEV = _UD_210_HOME + "UD_Old_East_Slavic-Birchbark/orv_birchbark-ud-dev.conllu"
"UD_210 dev set of OLD_EAST_SLAVIC_BIRCHBARK."
UD_210_OLD_EAST_SLAVIC_BIRCHBARK_TEST = _UD_210_HOME + "UD_Old_East_Slavic-Birchbark/orv_birchbark-ud-test.conllu"
"UD_210 test set of OLD_EAST_SLAVIC_BIRCHBARK."
UD_210_OLD_EAST_SLAVIC_RNC_TRAIN = _UD_210_HOME + "UD_Old_East_Slavic-RNC/orv_rnc-ud-train.conllu"
"UD_210 train set of OLD_EAST_SLAVIC_RNC."
UD_210_OLD_EAST_SLAVIC_RNC_TEST = _UD_210_HOME + "UD_Old_East_Slavic-RNC/orv_rnc-ud-test.conllu"
"UD_210 test set of OLD_EAST_SLAVIC_RNC."
UD_210_OLD_EAST_SLAVIC_TOROT_TRAIN = _UD_210_HOME + "UD_Old_East_Slavic-TOROT/orv_torot-ud-train.conllu"
"UD_210 train set of OLD_EAST_SLAVIC_TOROT."
UD_210_OLD_EAST_SLAVIC_TOROT_DEV = _UD_210_HOME + "UD_Old_East_Slavic-TOROT/orv_torot-ud-dev.conllu"
"UD_210 dev set of OLD_EAST_SLAVIC_TOROT."
UD_210_OLD_EAST_SLAVIC_TOROT_TEST = _UD_210_HOME + "UD_Old_East_Slavic-TOROT/orv_torot-ud-test.conllu"
"UD_210 test set of OLD_EAST_SLAVIC_TOROT."
# UD_Old_French-SRCMF: train/dev/test.
UD_210_OLD_FRENCH_SRCMF_TRAIN = _UD_210_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu"
"UD_210 train set of OLD_FRENCH_SRCMF."
UD_210_OLD_FRENCH_SRCMF_DEV = _UD_210_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu"
"UD_210 dev set of OLD_FRENCH_SRCMF."
UD_210_OLD_FRENCH_SRCMF_TEST = _UD_210_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu"
"UD_210 test set of OLD_FRENCH_SRCMF."
# UD_Old_Turkish-Tonqq: only a TEST constant is defined.
UD_210_OLD_TURKISH_TONQQ_TEST = _UD_210_HOME + "UD_Old_Turkish-Tonqq/otk_tonqq-ud-test.conllu"
"UD_210 test set of OLD_TURKISH_TONQQ."
# Persian: PerDT and Seraji, train/dev/test each.
UD_210_PERSIAN_PERDT_TRAIN = _UD_210_HOME + "UD_Persian-PerDT/fa_perdt-ud-train.conllu"
"UD_210 train set of PERSIAN_PERDT."
UD_210_PERSIAN_PERDT_DEV = _UD_210_HOME + "UD_Persian-PerDT/fa_perdt-ud-dev.conllu"
"UD_210 dev set of PERSIAN_PERDT."
UD_210_PERSIAN_PERDT_TEST = _UD_210_HOME + "UD_Persian-PerDT/fa_perdt-ud-test.conllu"
"UD_210 test set of PERSIAN_PERDT."
UD_210_PERSIAN_SERAJI_TRAIN = _UD_210_HOME + "UD_Persian-Seraji/fa_seraji-ud-train.conllu"
"UD_210 train set of PERSIAN_SERAJI."
UD_210_PERSIAN_SERAJI_DEV = _UD_210_HOME + "UD_Persian-Seraji/fa_seraji-ud-dev.conllu"
"UD_210 dev set of PERSIAN_SERAJI."
UD_210_PERSIAN_SERAJI_TEST = _UD_210_HOME + "UD_Persian-Seraji/fa_seraji-ud-test.conllu"
"UD_210 test set of PERSIAN_SERAJI."
# Polish: LFG and PDB have train/dev/test; PUD defines only a TEST constant.
UD_210_POLISH_LFG_TRAIN = _UD_210_HOME + "UD_Polish-LFG/pl_lfg-ud-train.conllu"
"UD_210 train set of POLISH_LFG."
UD_210_POLISH_LFG_DEV = _UD_210_HOME + "UD_Polish-LFG/pl_lfg-ud-dev.conllu"
"UD_210 dev set of POLISH_LFG."
UD_210_POLISH_LFG_TEST = _UD_210_HOME + "UD_Polish-LFG/pl_lfg-ud-test.conllu"
"UD_210 test set of POLISH_LFG."
UD_210_POLISH_PDB_TRAIN = _UD_210_HOME + "UD_Polish-PDB/pl_pdb-ud-train.conllu"
"UD_210 train set of POLISH_PDB."
UD_210_POLISH_PDB_DEV = _UD_210_HOME + "UD_Polish-PDB/pl_pdb-ud-dev.conllu"
"UD_210 dev set of POLISH_PDB."
UD_210_POLISH_PDB_TEST = _UD_210_HOME + "UD_Polish-PDB/pl_pdb-ud-test.conllu"
"UD_210 test set of POLISH_PDB."
UD_210_POLISH_PUD_TEST = _UD_210_HOME + "UD_Polish-PUD/pl_pud-ud-test.conllu"
"UD_210 test set of POLISH_PUD."
# UD_Pomak-Philotis: train/dev/test.
UD_210_POMAK_PHILOTIS_TRAIN = _UD_210_HOME + "UD_Pomak-Philotis/qpm_philotis-ud-train.conllu"
"UD_210 train set of POMAK_PHILOTIS."
UD_210_POMAK_PHILOTIS_DEV = _UD_210_HOME + "UD_Pomak-Philotis/qpm_philotis-ud-dev.conllu"
"UD_210 dev set of POMAK_PHILOTIS."
UD_210_POMAK_PHILOTIS_TEST = _UD_210_HOME + "UD_Pomak-Philotis/qpm_philotis-ud-test.conllu"
"UD_210 test set of POMAK_PHILOTIS."
# Portuguese: Bosque and GSD have train/dev/test; PUD defines only a TEST constant.
UD_210_PORTUGUESE_BOSQUE_TRAIN = _UD_210_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-train.conllu"
"UD_210 train set of PORTUGUESE_BOSQUE."
UD_210_PORTUGUESE_BOSQUE_DEV = _UD_210_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu"
"UD_210 dev set of PORTUGUESE_BOSQUE."
UD_210_PORTUGUESE_BOSQUE_TEST = _UD_210_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-test.conllu"
"UD_210 test set of PORTUGUESE_BOSQUE."
UD_210_PORTUGUESE_GSD_TRAIN = _UD_210_HOME + "UD_Portuguese-GSD/pt_gsd-ud-train.conllu"
"UD_210 train set of PORTUGUESE_GSD."
UD_210_PORTUGUESE_GSD_DEV = _UD_210_HOME + "UD_Portuguese-GSD/pt_gsd-ud-dev.conllu"
"UD_210 dev set of PORTUGUESE_GSD."
UD_210_PORTUGUESE_GSD_TEST = _UD_210_HOME + "UD_Portuguese-GSD/pt_gsd-ud-test.conllu"
"UD_210 test set of PORTUGUESE_GSD."
UD_210_PORTUGUESE_PUD_TEST = _UD_210_HOME + "UD_Portuguese-PUD/pt_pud-ud-test.conllu"
"UD_210 test set of PORTUGUESE_PUD."
# Romanian: ArT defines only a TEST constant; Nonstandard, RRT and SiMoNERo have train/dev/test.
UD_210_ROMANIAN_ART_TEST = _UD_210_HOME + "UD_Romanian-ArT/ro_art-ud-test.conllu"
"UD_210 test set of ROMANIAN_ART."
UD_210_ROMANIAN_NONSTANDARD_TRAIN = _UD_210_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu"
"UD_210 train set of ROMANIAN_NONSTANDARD."
UD_210_ROMANIAN_NONSTANDARD_DEV = _UD_210_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu"
"UD_210 dev set of ROMANIAN_NONSTANDARD."
UD_210_ROMANIAN_NONSTANDARD_TEST = _UD_210_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu"
"UD_210 test set of ROMANIAN_NONSTANDARD."
UD_210_ROMANIAN_RRT_TRAIN = _UD_210_HOME + "UD_Romanian-RRT/ro_rrt-ud-train.conllu"
"UD_210 train set of ROMANIAN_RRT."
UD_210_ROMANIAN_RRT_DEV = _UD_210_HOME + "UD_Romanian-RRT/ro_rrt-ud-dev.conllu"
"UD_210 dev set of ROMANIAN_RRT."
UD_210_ROMANIAN_RRT_TEST = _UD_210_HOME + "UD_Romanian-RRT/ro_rrt-ud-test.conllu"
"UD_210 test set of ROMANIAN_RRT."
UD_210_ROMANIAN_SIMONERO_TRAIN = _UD_210_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-train.conllu"
"UD_210 train set of ROMANIAN_SIMONERO."
UD_210_ROMANIAN_SIMONERO_DEV = _UD_210_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-dev.conllu"
"UD_210 dev set of ROMANIAN_SIMONERO."
UD_210_ROMANIAN_SIMONERO_TEST = _UD_210_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-test.conllu"
"UD_210 test set of ROMANIAN_SIMONERO."
# Russian: GSD, SynTagRus and Taiga have train/dev/test; PUD defines only a TEST constant.
UD_210_RUSSIAN_GSD_TRAIN = _UD_210_HOME + "UD_Russian-GSD/ru_gsd-ud-train.conllu"
"UD_210 train set of RUSSIAN_GSD."
UD_210_RUSSIAN_GSD_DEV = _UD_210_HOME + "UD_Russian-GSD/ru_gsd-ud-dev.conllu"
"UD_210 dev set of RUSSIAN_GSD."
UD_210_RUSSIAN_GSD_TEST = _UD_210_HOME + "UD_Russian-GSD/ru_gsd-ud-test.conllu"
"UD_210 test set of RUSSIAN_GSD."
UD_210_RUSSIAN_PUD_TEST = _UD_210_HOME + "UD_Russian-PUD/ru_pud-ud-test.conllu"
"UD_210 test set of RUSSIAN_PUD."
UD_210_RUSSIAN_SYNTAGRUS_TRAIN = _UD_210_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu"
"UD_210 train set of RUSSIAN_SYNTAGRUS."
UD_210_RUSSIAN_SYNTAGRUS_DEV = _UD_210_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu"
"UD_210 dev set of RUSSIAN_SYNTAGRUS."
UD_210_RUSSIAN_SYNTAGRUS_TEST = _UD_210_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu"
"UD_210 test set of RUSSIAN_SYNTAGRUS."
UD_210_RUSSIAN_TAIGA_TRAIN = _UD_210_HOME + "UD_Russian-Taiga/ru_taiga-ud-train.conllu"
"UD_210 train set of RUSSIAN_TAIGA."
UD_210_RUSSIAN_TAIGA_DEV = _UD_210_HOME + "UD_Russian-Taiga/ru_taiga-ud-dev.conllu"
"UD_210 dev set of RUSSIAN_TAIGA."
UD_210_RUSSIAN_TAIGA_TEST = _UD_210_HOME + "UD_Russian-Taiga/ru_taiga-ud-test.conllu"
"UD_210 test set of RUSSIAN_TAIGA."
# Sanskrit: UFAL defines only a TEST constant; Vedic has train/test only (no DEV constant).
UD_210_SANSKRIT_UFAL_TEST = _UD_210_HOME + "UD_Sanskrit-UFAL/sa_ufal-ud-test.conllu"
"UD_210 test set of SANSKRIT_UFAL."
UD_210_SANSKRIT_VEDIC_TRAIN = _UD_210_HOME + "UD_Sanskrit-Vedic/sa_vedic-ud-train.conllu"
"UD_210 train set of SANSKRIT_VEDIC."
UD_210_SANSKRIT_VEDIC_TEST = _UD_210_HOME + "UD_Sanskrit-Vedic/sa_vedic-ud-test.conllu"
"UD_210 test set of SANSKRIT_VEDIC."
# UD_Scottish_Gaelic-ARCOSG: train/dev/test.
UD_210_SCOTTISH_GAELIC_ARCOSG_TRAIN = _UD_210_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu"
"UD_210 train set of SCOTTISH_GAELIC_ARCOSG."
UD_210_SCOTTISH_GAELIC_ARCOSG_DEV = _UD_210_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu"
"UD_210 dev set of SCOTTISH_GAELIC_ARCOSG."
UD_210_SCOTTISH_GAELIC_ARCOSG_TEST = _UD_210_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-test.conllu"
"UD_210 test set of SCOTTISH_GAELIC_ARCOSG."
# UD_Serbian-SET: train/dev/test.
UD_210_SERBIAN_SET_TRAIN = _UD_210_HOME + "UD_Serbian-SET/sr_set-ud-train.conllu"
"UD_210 train set of SERBIAN_SET."
UD_210_SERBIAN_SET_DEV = _UD_210_HOME + "UD_Serbian-SET/sr_set-ud-dev.conllu"
"UD_210 dev set of SERBIAN_SET."
UD_210_SERBIAN_SET_TEST = _UD_210_HOME + "UD_Serbian-SET/sr_set-ud-test.conllu"
"UD_210 test set of SERBIAN_SET."
# UD_Skolt_Sami-Giellagas: only a TEST constant is defined.
UD_210_SKOLT_SAMI_GIELLAGAS_TEST = _UD_210_HOME + "UD_Skolt_Sami-Giellagas/sms_giellagas-ud-test.conllu"
"UD_210 test set of SKOLT_SAMI_GIELLAGAS."
# UD_Slovak-SNK: train/dev/test.
UD_210_SLOVAK_SNK_TRAIN = _UD_210_HOME + "UD_Slovak-SNK/sk_snk-ud-train.conllu"
"UD_210 train set of SLOVAK_SNK."
UD_210_SLOVAK_SNK_DEV = _UD_210_HOME + "UD_Slovak-SNK/sk_snk-ud-dev.conllu"
"UD_210 dev set of SLOVAK_SNK."
UD_210_SLOVAK_SNK_TEST = _UD_210_HOME + "UD_Slovak-SNK/sk_snk-ud-test.conllu"
"UD_210 test set of SLOVAK_SNK."
# Slovenian: SSJ has train/dev/test; SST has train/test only (no DEV constant).
UD_210_SLOVENIAN_SSJ_TRAIN = _UD_210_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-train.conllu"
"UD_210 train set of SLOVENIAN_SSJ."
UD_210_SLOVENIAN_SSJ_DEV = _UD_210_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu"
"UD_210 dev set of SLOVENIAN_SSJ."
UD_210_SLOVENIAN_SSJ_TEST = _UD_210_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-test.conllu"
"UD_210 test set of SLOVENIAN_SSJ."
UD_210_SLOVENIAN_SST_TRAIN = _UD_210_HOME + "UD_Slovenian-SST/sl_sst-ud-train.conllu"
"UD_210 train set of SLOVENIAN_SST."
UD_210_SLOVENIAN_SST_TEST = _UD_210_HOME + "UD_Slovenian-SST/sl_sst-ud-test.conllu"
"UD_210 test set of SLOVENIAN_SST."
# Soi (AHA) and South_Levantine_Arabic (MADAR): only TEST constants are defined.
UD_210_SOI_AHA_TEST = _UD_210_HOME + "UD_Soi-AHA/soj_aha-ud-test.conllu"
"UD_210 test set of SOI_AHA."
UD_210_SOUTH_LEVANTINE_ARABIC_MADAR_TEST = _UD_210_HOME + "UD_South_Levantine_Arabic-MADAR/ajp_madar-ud-test.conllu"
"UD_210 test set of SOUTH_LEVANTINE_ARABIC_MADAR."
# Spanish: AnCora and GSD have train/dev/test; PUD defines only a TEST constant.
UD_210_SPANISH_ANCORA_TRAIN = _UD_210_HOME + "UD_Spanish-AnCora/es_ancora-ud-train.conllu"
"UD_210 train set of SPANISH_ANCORA."
UD_210_SPANISH_ANCORA_DEV = _UD_210_HOME + "UD_Spanish-AnCora/es_ancora-ud-dev.conllu"
"UD_210 dev set of SPANISH_ANCORA."
UD_210_SPANISH_ANCORA_TEST = _UD_210_HOME + "UD_Spanish-AnCora/es_ancora-ud-test.conllu"
"UD_210 test set of SPANISH_ANCORA."
UD_210_SPANISH_GSD_TRAIN = _UD_210_HOME + "UD_Spanish-GSD/es_gsd-ud-train.conllu"
"UD_210 train set of SPANISH_GSD."
UD_210_SPANISH_GSD_DEV = _UD_210_HOME + "UD_Spanish-GSD/es_gsd-ud-dev.conllu"
"UD_210 dev set of SPANISH_GSD."
UD_210_SPANISH_GSD_TEST = _UD_210_HOME + "UD_Spanish-GSD/es_gsd-ud-test.conllu"
"UD_210 test set of SPANISH_GSD."
UD_210_SPANISH_PUD_TEST = _UD_210_HOME + "UD_Spanish-PUD/es_pud-ud-test.conllu"
"UD_210 test set of SPANISH_PUD."
# Swedish: LinES and Talbanken have train/dev/test; PUD defines only a TEST constant.
UD_210_SWEDISH_LINES_TRAIN = _UD_210_HOME + "UD_Swedish-LinES/sv_lines-ud-train.conllu"
"UD_210 train set of SWEDISH_LINES."
UD_210_SWEDISH_LINES_DEV = _UD_210_HOME + "UD_Swedish-LinES/sv_lines-ud-dev.conllu"
"UD_210 dev set of SWEDISH_LINES."
UD_210_SWEDISH_LINES_TEST = _UD_210_HOME + "UD_Swedish-LinES/sv_lines-ud-test.conllu"
"UD_210 test set of SWEDISH_LINES."
UD_210_SWEDISH_PUD_TEST = _UD_210_HOME + "UD_Swedish-PUD/sv_pud-ud-test.conllu"
"UD_210 test set of SWEDISH_PUD."
UD_210_SWEDISH_TALBANKEN_TRAIN = _UD_210_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu"
"UD_210 train set of SWEDISH_TALBANKEN."
UD_210_SWEDISH_TALBANKEN_DEV = _UD_210_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu"
"UD_210 dev set of SWEDISH_TALBANKEN."
UD_210_SWEDISH_TALBANKEN_TEST = _UD_210_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu"
"UD_210 test set of SWEDISH_TALBANKEN."
# UD_Swedish_Sign_Language-SSLC: train/dev/test.
UD_210_SWEDISH_SIGN_LANGUAGE_SSLC_TRAIN = _UD_210_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu"
"UD_210 train set of SWEDISH_SIGN_LANGUAGE_SSLC."
UD_210_SWEDISH_SIGN_LANGUAGE_SSLC_DEV = _UD_210_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu"
"UD_210 dev set of SWEDISH_SIGN_LANGUAGE_SSLC."
UD_210_SWEDISH_SIGN_LANGUAGE_SSLC_TEST = _UD_210_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu"
"UD_210 test set of SWEDISH_SIGN_LANGUAGE_SSLC."
# UD_Swiss_German-UZH: only a TEST constant is defined.
UD_210_SWISS_GERMAN_UZH_TEST = _UD_210_HOME + "UD_Swiss_German-UZH/gsw_uzh-ud-test.conllu"
"UD_210 test set of SWISS_GERMAN_UZH."
# Tagalog (TRG, Ugnayan): only TEST constants are defined.
UD_210_TAGALOG_TRG_TEST = _UD_210_HOME + "UD_Tagalog-TRG/tl_trg-ud-test.conllu"
"UD_210 test set of TAGALOG_TRG."
UD_210_TAGALOG_UGNAYAN_TEST = _UD_210_HOME + "UD_Tagalog-Ugnayan/tl_ugnayan-ud-test.conllu"
"UD_210 test set of TAGALOG_UGNAYAN."
# Tamil: MWTT defines only a TEST constant; TTB has train/dev/test.
UD_210_TAMIL_MWTT_TEST = _UD_210_HOME + "UD_Tamil-MWTT/ta_mwtt-ud-test.conllu"
"UD_210 test set of TAMIL_MWTT."
UD_210_TAMIL_TTB_TRAIN = _UD_210_HOME + "UD_Tamil-TTB/ta_ttb-ud-train.conllu"
"UD_210 train set of TAMIL_TTB."
UD_210_TAMIL_TTB_DEV = _UD_210_HOME + "UD_Tamil-TTB/ta_ttb-ud-dev.conllu"
"UD_210 dev set of TAMIL_TTB."
UD_210_TAMIL_TTB_TEST = _UD_210_HOME + "UD_Tamil-TTB/ta_ttb-ud-test.conllu"
"UD_210 test set of TAMIL_TTB."
# Tatar (NMCTT) and Teko (TuDeT): only TEST constants are defined.
UD_210_TATAR_NMCTT_TEST = _UD_210_HOME + "UD_Tatar-NMCTT/tt_nmctt-ud-test.conllu"
"UD_210 test set of TATAR_NMCTT."
UD_210_TEKO_TUDET_TEST = _UD_210_HOME + "UD_Teko-TuDeT/eme_tudet-ud-test.conllu"
"UD_210 test set of TEKO_TUDET."
# UD_Telugu-MTG: train/dev/test.
UD_210_TELUGU_MTG_TRAIN = _UD_210_HOME + "UD_Telugu-MTG/te_mtg-ud-train.conllu"
"UD_210 train set of TELUGU_MTG."
UD_210_TELUGU_MTG_DEV = _UD_210_HOME + "UD_Telugu-MTG/te_mtg-ud-dev.conllu"
"UD_210 dev set of TELUGU_MTG."
UD_210_TELUGU_MTG_TEST = _UD_210_HOME + "UD_Telugu-MTG/te_mtg-ud-test.conllu"
"UD_210 test set of TELUGU_MTG."
# Thai (PUD) and Tupinamba (TuDeT): only TEST constants are defined.
UD_210_THAI_PUD_TEST = _UD_210_HOME + "UD_Thai-PUD/th_pud-ud-test.conllu"
"UD_210 test set of THAI_PUD."
UD_210_TUPINAMBA_TUDET_TEST = _UD_210_HOME + "UD_Tupinamba-TuDeT/tpn_tudet-ud-test.conllu"
"UD_210 test set of TUPINAMBA_TUDET."
# Turkish: Atis and BOUN have train/dev/test; the FrameNet entries begin here.
UD_210_TURKISH_ATIS_TRAIN = _UD_210_HOME + "UD_Turkish-Atis/tr_atis-ud-train.conllu"
"UD_210 train set of TURKISH_ATIS."
UD_210_TURKISH_ATIS_DEV = _UD_210_HOME + "UD_Turkish-Atis/tr_atis-ud-dev.conllu"
"UD_210 dev set of TURKISH_ATIS."
UD_210_TURKISH_ATIS_TEST = _UD_210_HOME + "UD_Turkish-Atis/tr_atis-ud-test.conllu"
"UD_210 test set of TURKISH_ATIS."
UD_210_TURKISH_BOUN_TRAIN = _UD_210_HOME + "UD_Turkish-BOUN/tr_boun-ud-train.conllu"
"UD_210 train set of TURKISH_BOUN."
UD_210_TURKISH_BOUN_DEV = _UD_210_HOME + "UD_Turkish-BOUN/tr_boun-ud-dev.conllu"
"UD_210 dev set of TURKISH_BOUN."
UD_210_TURKISH_BOUN_TEST = _UD_210_HOME + "UD_Turkish-BOUN/tr_boun-ud-test.conllu"
"UD_210 test set of TURKISH_BOUN."
UD_210_TURKISH_FRAMENET_TRAIN = _UD_210_HOME + "UD_Turkish-FrameNet/tr_framenet-ud-train.conllu"
"UD_210 train set of TURKISH_FRAMENET."
UD_210_TURKISH_FRAMENET_DEV = _UD_210_HOME + "UD_Turkish-FrameNet/tr_framenet-ud-dev.conllu"
"UD_210 dev set of TURKISH_FRAMENET."
UD_210_TURKISH_FRAMENET_TEST = _UD_210_HOME + "UD_Turkish-FrameNet/tr_framenet-ud-test.conllu"
"UD_210 test set of TURKISH_FRAMENET."
UD_210_TURKISH_GB_TEST = _UD_210_HOME + "UD_Turkish-GB/tr_gb-ud-test.conllu"
"UD_210 test set of TURKISH_GB."
UD_210_TURKISH_IMST_TRAIN = _UD_210_HOME + "UD_Turkish-IMST/tr_imst-ud-train.conllu"
"UD_210 train set of TURKISH_IMST."
UD_210_TURKISH_IMST_DEV = _UD_210_HOME + "UD_Turkish-IMST/tr_imst-ud-dev.conllu"
"UD_210 dev set of TURKISH_IMST."
UD_210_TURKISH_IMST_TEST = _UD_210_HOME + "UD_Turkish-IMST/tr_imst-ud-test.conllu"
"UD_210 test set of TURKISH_IMST."
UD_210_TURKISH_KENET_TRAIN = _UD_210_HOME + "UD_Turkish-Kenet/tr_kenet-ud-train.conllu"
"UD_210 train set of TURKISH_KENET."
UD_210_TURKISH_KENET_DEV = _UD_210_HOME + "UD_Turkish-Kenet/tr_kenet-ud-dev.conllu"
"UD_210 dev set of TURKISH_KENET."
UD_210_TURKISH_KENET_TEST = _UD_210_HOME + "UD_Turkish-Kenet/tr_kenet-ud-test.conllu"
"UD_210 test set of TURKISH_KENET."
UD_210_TURKISH_PUD_TEST = _UD_210_HOME + "UD_Turkish-PUD/tr_pud-ud-test.conllu"
"UD_210 test set of TURKISH_PUD."
UD_210_TURKISH_PENN_TRAIN = _UD_210_HOME + "UD_Turkish-Penn/tr_penn-ud-train.conllu"
"UD_210 train set of TURKISH_PENN."
UD_210_TURKISH_PENN_DEV = _UD_210_HOME + "UD_Turkish-Penn/tr_penn-ud-dev.conllu"
"UD_210 dev set of TURKISH_PENN."
UD_210_TURKISH_PENN_TEST = _UD_210_HOME + "UD_Turkish-Penn/tr_penn-ud-test.conllu"
"UD_210 test set of TURKISH_PENN."
UD_210_TURKISH_TOURISM_TRAIN = _UD_210_HOME + "UD_Turkish-Tourism/tr_tourism-ud-train.conllu"
"UD_210 train set of TURKISH_TOURISM."
UD_210_TURKISH_TOURISM_DEV = _UD_210_HOME + "UD_Turkish-Tourism/tr_tourism-ud-dev.conllu"
"UD_210 dev set of TURKISH_TOURISM."
UD_210_TURKISH_TOURISM_TEST = _UD_210_HOME + "UD_Turkish-Tourism/tr_tourism-ud-test.conllu"
"UD_210 test set of TURKISH_TOURISM."
UD_210_TURKISH_GERMAN_SAGT_TRAIN = _UD_210_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-train.conllu"
"UD_210 train set of TURKISH_GERMAN_SAGT."
UD_210_TURKISH_GERMAN_SAGT_DEV = _UD_210_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-dev.conllu"
"UD_210 dev set of TURKISH_GERMAN_SAGT."
UD_210_TURKISH_GERMAN_SAGT_TEST = _UD_210_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-test.conllu"
"UD_210 test set of TURKISH_GERMAN_SAGT."
UD_210_UKRAINIAN_IU_TRAIN = _UD_210_HOME + "UD_Ukrainian-IU/uk_iu-ud-train.conllu"
"UD_210 train set of UKRAINIAN_IU."
UD_210_UKRAINIAN_IU_DEV = _UD_210_HOME + "UD_Ukrainian-IU/uk_iu-ud-dev.conllu"
"UD_210 dev set of UKRAINIAN_IU."
UD_210_UKRAINIAN_IU_TEST = _UD_210_HOME + "UD_Ukrainian-IU/uk_iu-ud-test.conllu"
"UD_210 test set of UKRAINIAN_IU."
UD_210_UMBRIAN_IKUVINA_TEST = _UD_210_HOME + "UD_Umbrian-IKUVINA/xum_ikuvina-ud-test.conllu"
"UD_210 test set of UMBRIAN_IKUVINA."
UD_210_UPPER_SORBIAN_UFAL_TRAIN = _UD_210_HOME + "UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu"
"UD_210 train set of UPPER_SORBIAN_UFAL."
UD_210_UPPER_SORBIAN_UFAL_TEST = _UD_210_HOME + "UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu"
"UD_210 test set of UPPER_SORBIAN_UFAL."
UD_210_URDU_UDTB_TRAIN = _UD_210_HOME + "UD_Urdu-UDTB/ur_udtb-ud-train.conllu"
"UD_210 train set of URDU_UDTB."
UD_210_URDU_UDTB_DEV = _UD_210_HOME + "UD_Urdu-UDTB/ur_udtb-ud-dev.conllu"
"UD_210 dev set of URDU_UDTB."
UD_210_URDU_UDTB_TEST = _UD_210_HOME + "UD_Urdu-UDTB/ur_udtb-ud-test.conllu"
"UD_210 test set of URDU_UDTB."
UD_210_UYGHUR_UDT_TRAIN = _UD_210_HOME + "UD_Uyghur-UDT/ug_udt-ud-train.conllu"
"UD_210 train set of UYGHUR_UDT."
UD_210_UYGHUR_UDT_DEV = _UD_210_HOME + "UD_Uyghur-UDT/ug_udt-ud-dev.conllu"
"UD_210 dev set of UYGHUR_UDT."
UD_210_UYGHUR_UDT_TEST = _UD_210_HOME + "UD_Uyghur-UDT/ug_udt-ud-test.conllu"
"UD_210 test set of UYGHUR_UDT."
UD_210_VIETNAMESE_VTB_TRAIN = _UD_210_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-train.conllu"
"UD_210 train set of VIETNAMESE_VTB."
UD_210_VIETNAMESE_VTB_DEV = _UD_210_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu"
"UD_210 dev set of VIETNAMESE_VTB."
UD_210_VIETNAMESE_VTB_TEST = _UD_210_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-test.conllu"
"UD_210 test set of VIETNAMESE_VTB."
UD_210_WARLPIRI_UFAL_TEST = _UD_210_HOME + "UD_Warlpiri-UFAL/wbp_ufal-ud-test.conllu"
"UD_210 test set of WARLPIRI_UFAL."
UD_210_WELSH_CCG_TRAIN = _UD_210_HOME + "UD_Welsh-CCG/cy_ccg-ud-train.conllu"
"UD_210 train set of WELSH_CCG."
UD_210_WELSH_CCG_DEV = _UD_210_HOME + "UD_Welsh-CCG/cy_ccg-ud-dev.conllu"
"UD_210 dev set of WELSH_CCG."
UD_210_WELSH_CCG_TEST = _UD_210_HOME + "UD_Welsh-CCG/cy_ccg-ud-test.conllu"
"UD_210 test set of WELSH_CCG."
UD_210_WESTERN_ARMENIAN_ARMTDP_TRAIN = _UD_210_HOME + "UD_Western_Armenian-ArmTDP/hyw_armtdp-ud-train.conllu"
"UD_210 train set of WESTERN_ARMENIAN_ARMTDP."
UD_210_WESTERN_ARMENIAN_ARMTDP_DEV = _UD_210_HOME + "UD_Western_Armenian-ArmTDP/hyw_armtdp-ud-dev.conllu"
"UD_210 dev set of WESTERN_ARMENIAN_ARMTDP."
UD_210_WESTERN_ARMENIAN_ARMTDP_TEST = _UD_210_HOME + "UD_Western_Armenian-ArmTDP/hyw_armtdp-ud-test.conllu"
"UD_210 test set of WESTERN_ARMENIAN_ARMTDP."
UD_210_WOLOF_WTB_TRAIN = _UD_210_HOME + "UD_Wolof-WTB/wo_wtb-ud-train.conllu"
"UD_210 train set of WOLOF_WTB."
UD_210_WOLOF_WTB_DEV = _UD_210_HOME + "UD_Wolof-WTB/wo_wtb-ud-dev.conllu"
"UD_210 dev set of WOLOF_WTB."
UD_210_WOLOF_WTB_TEST = _UD_210_HOME + "UD_Wolof-WTB/wo_wtb-ud-test.conllu"
"UD_210 test set of WOLOF_WTB."
UD_210_XIBE_XDT_TEST = _UD_210_HOME + "UD_Xibe-XDT/sjo_xdt-ud-test.conllu"
"UD_210 test set of XIBE_XDT."
UD_210_YAKUT_YKTDT_TEST = _UD_210_HOME + "UD_Yakut-YKTDT/sah_yktdt-ud-test.conllu"
"UD_210 test set of YAKUT_YKTDT."
UD_210_YORUBA_YTB_TEST = _UD_210_HOME + "UD_Yoruba-YTB/yo_ytb-ud-test.conllu"
"UD_210 test set of YORUBA_YTB."
UD_210_YUPIK_SLI_TEST = _UD_210_HOME + "UD_Yupik-SLI/ess_sli-ud-test.conllu"
"UD_210 test set of YUPIK_SLI."


================================================
FILE: hanlp/datasets/parsing/ud/ud210m.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-21 20:39
import os

from hanlp.datasets.parsing.ud import concat_treebanks
from hanlp.datasets.parsing.ud.ud210 import _UD_210_HOME

# Evaluated at import time: ``concat_treebanks`` is called with the UD 2.10 home and the
# version string '2.10'; its return value is used below as the directory that contains
# the merged ``train.conllu`` / ``dev.conllu`` / ``test.conllu`` files.
# NOTE(review): presumably the first import triggers the download and concatenation of
# every treebank — confirm against ``concat_treebanks`` before importing this eagerly.
_UD_210_MULTILINGUAL_HOME = concat_treebanks(_UD_210_HOME, '2.10')
# The bare strings after each constant are attribute docstrings; keep each one directly
# below the assignment it describes.
UD_210_MULTILINGUAL_TRAIN = os.path.join(_UD_210_MULTILINGUAL_HOME, 'train.conllu')
"Training set of multilingual UD_210 obtained by concatenating all training sets."
UD_210_MULTILINGUAL_DEV = os.path.join(_UD_210_MULTILINGUAL_HOME, 'dev.conllu')
"Dev set of multilingual UD_210 obtained by concatenating all dev sets."
UD_210_MULTILINGUAL_TEST = os.path.join(_UD_210_MULTILINGUAL_HOME, 'test.conllu')
"Test set of multilingual UD_210 obtained by concatenating all test sets."


================================================
FILE: hanlp/datasets/parsing/ud/ud23.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-21 20:26

# Download URL of the ``ud-treebanks-v2.3.tgz`` archive. Every ``UD_23_*`` constant in
# this module is this URL followed by ``#<treebank dir>/<file>.conllu``.
_UD_23_HOME = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2895/ud-treebanks-v2.3.tgz?sequence=1&isAllowed=y"
# Download URL of the ``ud-treebanks-v2.4.tgz`` archive.
# NOTE(review): not referenced by the visible part of this module — confirm whether it
# is still needed here or belongs in a dedicated 2.4 module.
_UD_24_HOME = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2988/ud-treebanks-v2.4.tgz?sequence=4&isAllowed=y"


def _list_dir(path, home, out_path='ud23.py'):
    """Append one constant definition per treebank split to a Python source file.

    Development helper that generates the ``UD_*`` constants of this module. The
    resource is resolved with ``get_resource`` and every ``UD_*`` entry below it is
    scanned for ``*train.conllu``, ``*dev.conllu`` and ``*test.conllu`` files. For each
    split that exists, a line of the following form is appended to ``out_path``::

        <PREFIX>_<TREEBANK>_<SPLIT> = <home> + "#<treebank dir>/<file name>"

    Args:
        path: Resource identifier handed to ``get_resource``.
        home: Name (not value) of the module-level variable holding the archive
            location, e.g. ``'_UD_23_HOME'``. It is written verbatim into the generated
            code; with leading underscores and ``_HOME`` removed it becomes the
            constant prefix.
        out_path: File the generated lines are appended to. Defaults to ``'ud23.py'``,
            resolved against the current working directory.
    """
    import glob
    import os

    from hanlp.utils.io_util import get_resource

    prefix = home.lstrip('_').replace('_HOME', '')
    root = get_resource(path)

    lines = []
    # glob.escape keeps special characters in the resolved path from acting as wildcards.
    for treebank_dir in sorted(glob.glob(os.path.join(glob.escape(root), 'UD_*'))):
        basename = os.path.basename(treebank_dir)
        name = basename[len('UD_'):].upper().replace('-', '_')
        for split in 'train', 'dev', 'test':
            # glob returns matches in arbitrary order; sort so that the chosen file is
            # the same on every run and every platform.
            matches = sorted(glob.glob(os.path.join(glob.escape(treebank_dir), f'*{split}.conllu')))
            if not matches:
                continue
            filename = os.path.basename(matches[0])
            lines.append(f'{prefix}_{name}_{split.upper()} = {home} + "#{basename}/{filename}"\n')

    # Append (never truncate): the target is normally this very module.
    with open(out_path, 'a', encoding='utf-8') as out:
        out.writelines(lines)


def main():
    """Generate the ``UD_23_*`` constant definitions for the archive at ``_UD_23_HOME``."""
    _list_dir(_UD_23_HOME, '_UD_23_HOME')


# Script entry point: only runs the generator when this file is executed directly.
if __name__ == '__main__':
    main()

UD_23_AFRIKAANS_AFRIBOOMS_TRAIN = _UD_23_HOME + "#UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu"
UD_23_AFRIKAANS_AFRIBOOMS_DEV = _UD_23_HOME + "#UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu"
UD_23_AFRIKAANS_AFRIBOOMS_TEST = _UD_23_HOME + "#UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu"
UD_23_AKKADIAN_PISANDUB_TEST = _UD_23_HOME + "#UD_Akkadian-PISANDUB/akk_pisandub-ud-test.conllu"
UD_23_AMHARIC_ATT_TEST = _UD_23_HOME + "#UD_Amharic-ATT/am_att-ud-test.conllu"
UD_23_ANCIENT_GREEK_PROIEL_TRAIN = _UD_23_HOME + "#UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu"
UD_23_ANCIENT_GREEK_PROIEL_DEV = _UD_23_HOME + "#UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu"
UD_23_ANCIENT_GREEK_PROIEL_TEST = _UD_23_HOME + "#UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu"
UD_23_ANCIENT_GREEK_PERSEUS_TRAIN = _UD_23_HOME + "#UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu"
UD_23_ANCIENT_GREEK_PERSEUS_DEV = _UD_23_HOME + "#UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu"
UD_23_ANCIENT_GREEK_PERSEUS_TEST = _UD_23_HOME + "#UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu"
UD_23_ARABIC_NYUAD_TRAIN = _UD_23_HOME + "#UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu"
UD_23_ARABIC_NYUAD_DEV = _UD_23_HOME + "#UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu"
UD_23_ARABIC_NYUAD_TEST = _UD_23_HOME + "#UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu"
UD_23_ARABIC_PADT_TRAIN = _UD_23_HOME + "#UD_Arabic-PADT/ar_padt-ud-train.conllu"
UD_23_ARABIC_PADT_DEV = _UD_23_HOME + "#UD_Arabic-PADT/ar_padt-ud-dev.conllu"
UD_23_ARABIC_PADT_TEST = _UD_23_HOME + "#UD_Arabic-PADT/ar_padt-ud-test.conllu"
UD_23_ARABIC_PUD_TEST = _UD_23_HOME + "#UD_Arabic-PUD/ar_pud-ud-test.conllu"
UD_23_ARMENIAN_ARMTDP_TRAIN = _UD_23_HOME + "#UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu"
UD_23_ARMENIAN_ARMTDP_TEST = _UD_23_HOME + "#UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu"
UD_23_BAMBARA_CRB_TEST = _UD_23_HOME + "#UD_Bambara-CRB/bm_crb-ud-test.conllu"
UD_23_BASQUE_BDT_TRAIN = _UD_23_HOME + "#UD_Basque-BDT/eu_bdt-ud-train.conllu"
UD_23_BASQUE_BDT_DEV = _UD_23_HOME + "#UD_Basque-BDT/eu_bdt-ud-dev.conllu"
UD_23_BASQUE_BDT_TEST = _UD_23_HOME + "#UD_Basque-BDT/eu_bdt-ud-test.conllu"
UD_23_BELARUSIAN_HSE_TRAIN = _UD_23_HOME + "#UD_Belarusian-HSE/be_hse-ud-train.conllu"
UD_23_BELARUSIAN_HSE_DEV = _UD_23_HOME + "#UD_Belarusian-HSE/be_hse-ud-dev.conllu"
UD_23_BELARUSIAN_HSE_TEST = _UD_23_HOME + "#UD_Belarusian-HSE/be_hse-ud-test.conllu"
UD_23_BRETON_KEB_TEST = _UD_23_HOME + "#UD_Breton-KEB/br_keb-ud-test.conllu"
UD_23_BULGARIAN_BTB_TRAIN = _UD_23_HOME + "#UD_Bulgarian-BTB/bg_btb-ud-train.conllu"
UD_23_BULGARIAN_BTB_DEV = _UD_23_HOME + "#UD_Bulgarian-BTB/bg_btb-ud-dev.conllu"
UD_23_BULGARIAN_BTB_TEST = _UD_23_HOME + "#UD_Bulgarian-BTB/bg_btb-ud-test.conllu"
UD_23_BURYAT_BDT_TRAIN = _UD_23_HOME + "#UD_Buryat-BDT/bxr_bdt-ud-train.conllu"
UD_23_BURYAT_BDT_TEST = _UD_23_HOME + "#UD_Buryat-BDT/bxr_bdt-ud-test.conllu"
UD_23_CANTONESE_HK_TEST = _UD_23_HOME + "#UD_Cantonese-HK/yue_hk-ud-test.conllu"
UD_23_CATALAN_ANCORA_TRAIN = _UD_23_HOME + "#UD_Catalan-AnCora/ca_ancora-ud-train.conllu"
UD_23_CATALAN_ANCORA_DEV = _UD_23_HOME + "#UD_Catalan-AnCora/ca_ancora-ud-dev.conllu"
UD_23_CATALAN_ANCORA_TEST = _UD_23_HOME + "#UD_Catalan-AnCora/ca_ancora-ud-test.conllu"
UD_23_CHINESE_CFL_TEST = _UD_23_HOME + "#UD_Chinese-CFL/zh_cfl-ud-test.conllu"
UD_23_CHINESE_GSD_TRAIN = _UD_23_HOME + "#UD_Chinese-GSD/zh_gsd-ud-train.conllu"
UD_23_CHINESE_GSD_DEV = _UD_23_HOME + "#UD_Chinese-GSD/zh_gsd-ud-dev.conllu"
UD_23_CHINESE_GSD_TEST = _UD_23_HOME + "#UD_Chinese-GSD/zh_gsd-ud-test.conllu"
UD_23_CHINESE_HK_TEST = _UD_23_HOME + "#UD_Chinese-HK/zh_hk-ud-test.conllu"
UD_23_CHINESE_PUD_TEST = _UD_23_HOME + "#UD_Chinese-PUD/zh_pud-ud-test.conllu"
UD_23_COPTIC_SCRIPTORIUM_TRAIN = _UD_23_HOME + "#UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu"
UD_23_COPTIC_SCRIPTORIUM_DEV = _UD_23_HOME + "#UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu"
UD_23_COPTIC_SCRIPTORIUM_TEST = _UD_23_HOME + "#UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu"
UD_23_CROATIAN_SET_TRAIN = _UD_23_HOME + "#UD_Croatian-SET/hr_set-ud-train.conllu"
UD_23_CROATIAN_SET_DEV = _UD_23_HOME + "#UD_Croatian-SET/hr_set-ud-dev.conllu"
UD_23_CROATIAN_SET_TEST = _UD_23_HOME + "#UD_Croatian-SET/hr_set-ud-test.conllu"
UD_23_CZECH_CAC_TRAIN = _UD_23_HOME + "#UD_Czech-CAC/cs_cac-ud-train.conllu"
UD_23_CZECH_CAC_DEV = _UD_23_HOME + "#UD_Czech-CAC/cs_cac-ud-dev.conllu"
UD_23_CZECH_CAC_TEST = _UD_23_HOME + "#UD_Czech-CAC/cs_cac-ud-test.conllu"
UD_23_CZECH_CLTT_TRAIN = _UD_23_HOME + "#UD_Czech-CLTT/cs_cltt-ud-train.conllu"
UD_23_CZECH_CLTT_DEV = _UD_23_HOME + "#UD_Czech-CLTT/cs_cltt-ud-dev.conllu"
UD_23_CZECH_CLTT_TEST = _UD_23_HOME + "#UD_Czech-CLTT/cs_cltt-ud-test.conllu"
UD_23_CZECH_FICTREE_TRAIN = _UD_23_HOME + "#UD_Czech-FicTree/cs_fictree-ud-train.conllu"
UD_23_CZECH_FICTREE_DEV = _UD_23_HOME + "#UD_Czech-FicTree/cs_fictree-ud-dev.conllu"
UD_23_CZECH_FICTREE_TEST = _UD_23_HOME + "#UD_Czech-FicTree/cs_fictree-ud-test.conllu"
UD_23_CZECH_PDT_TRAIN = _UD_23_HOME + "#UD_Czech-PDT/cs_pdt-ud-train.conllu"
UD_23_CZECH_PDT_DEV = _UD_23_HOME + "#UD_Czech-PDT/cs_pdt-ud-dev.conllu"
UD_23_CZECH_PDT_TEST = _UD_23_HOME + "#UD_Czech-PDT/cs_pdt-ud-test.conllu"
UD_23_CZECH_PUD_TEST = _UD_23_HOME + "#UD_Czech-PUD/cs_pud-ud-test.conllu"
UD_23_DANISH_DDT_TRAIN = _UD_23_HOME + "#UD_Danish-DDT/da_ddt-ud-train.conllu"
UD_23_DANISH_DDT_DEV = _UD_23_HOME + "#UD_Danish-DDT/da_ddt-ud-dev.conllu"
UD_23_DANISH_DDT_TEST = _UD_23_HOME + "#UD_Danish-DDT/da_ddt-ud-test.conllu"
UD_23_DUTCH_ALPINO_TRAIN = _UD_23_HOME + "#UD_Dutch-Alpino/nl_alpino-ud-train.conllu"
UD_23_DUTCH_ALPINO_DEV = _UD_23_HOME + "#UD_Dutch-Alpino/nl_alpino-ud-dev.conllu"
UD_23_DUTCH_ALPINO_TEST = _UD_23_HOME + "#UD_Dutch-Alpino/nl_alpino-ud-test.conllu"
UD_23_DUTCH_LASSYSMALL_TRAIN = _UD_23_HOME + "#UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu"
UD_23_DUTCH_LASSYSMALL_DEV = _UD_23_HOME + "#UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu"
UD_23_DUTCH_LASSYSMALL_TEST = _UD_23_HOME + "#UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu"
UD_23_ENGLISH_ESL_TRAIN = _UD_23_HOME + "#UD_English-ESL/en_esl-ud-train.conllu"
UD_23_ENGLISH_ESL_DEV = _UD_23_HOME + "#UD_English-ESL/en_esl-ud-dev.conllu"
UD_23_ENGLISH_ESL_TEST = _UD_23_HOME + "#UD_English-ESL/en_esl-ud-test.conllu"
UD_23_ENGLISH_EWT_TRAIN = _UD_23_HOME + "#UD_English-EWT/en_ewt-ud-train.conllu"
UD_23_ENGLISH_EWT_DEV = _UD_23_HOME + "#UD_English-EWT/en_ewt-ud-dev.conllu"
UD_23_ENGLISH_EWT_TEST = _UD_23_HOME + "#UD_English-EWT/en_ewt-ud-test.conllu"
UD_23_ENGLISH_GUM_TRAIN = _UD_23_HOME + "#UD_English-GUM/en_gum-ud-train.conllu"
UD_23_ENGLISH_GUM_DEV = _UD_23_HOME + "#UD_English-GUM/en_gum-ud-dev.conllu"
UD_23_ENGLISH_GUM_TEST = _UD_23_HOME + "#UD_English-GUM/en_gum-ud-test.conllu"
UD_23_ENGLISH_LINES_TRAIN = _UD_23_HOME + "#UD_English-LinES/en_lines-ud-train.conllu"
UD_23_ENGLISH_LINES_DEV = _UD_23_HOME + "#UD_English-LinES/en_lines-ud-dev.conllu"
UD_23_ENGLISH_LINES_TEST = _UD_23_HOME + "#UD_English-LinES/en_lines-ud-test.conllu"
UD_23_ENGLISH_PUD_TEST = _UD_23_HOME + "#UD_English-PUD/en_pud-ud-test.conllu"
UD_23_ENGLISH_PARTUT_TRAIN = _UD_23_HOME + "#UD_English-ParTUT/en_partut-ud-train.conllu"
UD_23_ENGLISH_PARTUT_DEV = _UD_23_HOME + "#UD_English-ParTUT/en_partut-ud-dev.conllu"
UD_23_ENGLISH_PARTUT_TEST = _UD_23_HOME + "#UD_English-ParTUT/en_partut-ud-test.conllu"
UD_23_ERZYA_JR_TEST = _UD_23_HOME + "#UD_Erzya-JR/myv_jr-ud-test.conllu"
UD_23_ESTONIAN_EDT_TRAIN = _UD_23_HOME + "#UD_Estonian-EDT/et_edt-ud-train.conllu"
UD_23_ESTONIAN_EDT_DEV = _UD_23_HOME + "#UD_Estonian-EDT/et_edt-ud-dev.conllu"
UD_23_ESTONIAN_EDT_TEST = _UD_23_HOME + "#UD_Estonian-EDT/et_edt-ud-test.conllu"
UD_23_FAROESE_OFT_TEST = _UD_23_HOME + "#UD_Faroese-OFT/fo_oft-ud-test.conllu"
UD_23_FINNISH_FTB_TRAIN = _UD_23_HOME + "#UD_Finnish-FTB/fi_ftb-ud-train.conllu"
UD_23_FINNISH_FTB_DEV = _UD_23_HOME + "#UD_Finnish-FTB/fi_ftb-ud-dev.conllu"
UD_23_FINNISH_FTB_TEST = _UD_23_HOME + "#UD_Finnish-FTB/fi_ftb-ud-test.conllu"
UD_23_FINNISH_PUD_TEST = _UD_23_HOME + "#UD_Finnish-PUD/fi_pud-ud-test.conllu"
UD_23_FINNISH_TDT_TRAIN = _UD_23_HOME + "#UD_Finnish-TDT/fi_tdt-ud-train.conllu"
UD_23_FINNISH_TDT_DEV = _UD_23_HOME + "#UD_Finnish-TDT/fi_tdt-ud-dev.conllu"
UD_23_FINNISH_TDT_TEST = _UD_23_HOME + "#UD_Finnish-TDT/fi_tdt-ud-test.conllu"
UD_23_FRENCH_FTB_TRAIN = _UD_23_HOME + "#UD_French-FTB/fr_ftb-ud-train.conllu"
UD_23_FRENCH_FTB_DEV = _UD_23_HOME + "#UD_French-FTB/fr_ftb-ud-dev.conllu"
UD_23_FRENCH_FTB_TEST = _UD_23_HOME + "#UD_French-FTB/fr_ftb-ud-test.conllu"
UD_23_FRENCH_GSD_TRAIN = _UD_23_HOME + "#UD_French-GSD/fr_gsd-ud-train.conllu"
UD_23_FRENCH_GSD_DEV = _UD_23_HOME + "#UD_French-GSD/fr_gsd-ud-dev.conllu"
UD_23_FRENCH_GSD_TEST = _UD_23_HOME + "#UD_French-GSD/fr_gsd-ud-test.conllu"
UD_23_FRENCH_PUD_TEST = _UD_23_HOME + "#UD_French-PUD/fr_pud-ud-test.conllu"
UD_23_FRENCH_PARTUT_TRAIN = _UD_23_HOME + "#UD_French-ParTUT/fr_partut-ud-train.conllu"
UD_23_FRENCH_PARTUT_DEV = _UD_23_HOME + "#UD_French-ParTUT/fr_partut-ud-dev.conllu"
UD_23_FRENCH_PARTUT_TEST = _UD_23_HOME + "#UD_French-ParTUT/fr_partut-ud-test.conllu"
UD_23_FRENCH_SEQUOIA_TRAIN = _UD_23_HOME + "#UD_French-Sequoia/fr_sequoia-ud-train.conllu"
UD_23_FRENCH_SEQUOIA_DEV = _UD_23_HOME + "#UD_French-Sequoia/fr_sequoia-ud-dev.conllu"
UD_23_FRENCH_SEQUOIA_TEST = _UD_23_HOME + "#UD_French-Sequoia/fr_sequoia-ud-test.conllu"
UD_23_FRENCH_SPOKEN_TRAIN = _UD_23_HOME + "#UD_French-Spoken/fr_spoken-ud-train.conllu"
UD_23_FRENCH_SPOKEN_DEV = _UD_23_HOME + "#UD_French-Spoken/fr_spoken-ud-dev.conllu"
UD_23_FRENCH_SPOKEN_TEST = _UD_23_HOME + "#UD_French-Spoken/fr_spoken-ud-test.conllu"
UD_23_GALICIAN_CTG_TRAIN = _UD_23_HOME + "#UD_Galician-CTG/gl_ctg-ud-train.conllu"
UD_23_GALICIAN_CTG_DEV = _UD_23_HOME + "#UD_Galician-CTG/gl_ctg-ud-dev.conllu"
UD_23_GALICIAN_CTG_TEST = _UD_23_HOME + "#UD_Galician-CTG/gl_ctg-ud-test.conllu"
UD_23_GALICIAN_TREEGAL_TRAIN = _UD_23_HOME + "#UD_Galician-TreeGal/gl_treegal-ud-train.conllu"
UD_23_GALICIAN_TREEGAL_TEST = _UD_23_HOME + "#UD_Galician-TreeGal/gl_treegal-ud-test.conllu"
UD_23_GERMAN_GSD_TRAIN = _UD_23_HOME + "#UD_German-GSD/de_gsd-ud-train.conllu"
UD_23_GERMAN_GSD_DEV = _UD_23_HOME + "#UD_German-GSD/de_gsd-ud-dev.conllu"
UD_23_GERMAN_GSD_TEST = _UD_23_HOME + "#UD_German-GSD/de_gsd-ud-test.conllu"
UD_23_GERMAN_PUD_TEST = _UD_23_HOME + "#UD_German-PUD/de_pud-ud-test.conllu"
UD_23_GOTHIC_PROIEL_TRAIN = _UD_23_HOME + "#UD_Gothic-PROIEL/got_proiel-ud-train.conllu"
UD_23_GOTHIC_PROIEL_DEV = _UD_23_HOME + "#UD_Gothic-PROIEL/got_proiel-ud-dev.conllu"
UD_23_GOTHIC_PROIEL_TEST = _UD_23_HOME + "#UD_Gothic-PROIEL/got_proiel-ud-test.conllu"
UD_23_GREEK_GDT_TRAIN = _UD_23_HOME + "#UD_Greek-GDT/el_gdt-ud-train.conllu"
UD_23_GREEK_GDT_DEV = _UD_23_HOME + "#UD_Greek-GDT/el_gdt-ud-dev.conllu"
UD_23_GREEK_GDT_TEST = _UD_23_HOME + "#UD_Greek-GDT/el_gdt-ud-test.conllu"
UD_23_HEBREW_HTB_TRAIN = _UD_23_HOME + "#UD_Hebrew-HTB/he_htb-ud-train.conllu"
UD_23_HEBREW_HTB_DEV = _UD_23_HOME + "#UD_Hebrew-HTB/he_htb-ud-dev.conllu"
UD_23_HEBREW_HTB_TEST = _UD_23_HOME + "#UD_Hebrew-HTB/he_htb-ud-test.conllu"
UD_23_HINDI_HDTB_TRAIN = _UD_23_HOME + "#UD_Hindi-HDTB/hi_hdtb-ud-train.conllu"
UD_23_HINDI_HDTB_DEV = _UD_23_HOME + "#UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu"
UD_23_HINDI_HDTB_TEST = _UD_23_HOME + "#UD_Hindi-HDTB/hi_hdtb-ud-test.conllu"
UD_23_HINDI_PUD_TEST = _UD_23_HOME + "#UD_Hindi-PUD/hi_pud-ud-test.conllu"
UD_23_HINDI_ENGLISH_HIENCS_TRAIN = _UD_23_HOME + "#UD_Hindi_English-HIENCS/qhe_hiencs-ud-train.conllu"
UD_23_HINDI_ENGLISH_HIENCS_DEV = _UD_23_HOME + "#UD_Hindi_English-HIENCS/qhe_hiencs-ud-dev.conllu"
UD_23_HINDI_ENGLISH_HIENCS_TEST = _UD_23_HOME + "#UD_Hindi_English-HIENCS/qhe_hiencs-ud-test.conllu"
UD_23_HUNGARIAN_SZEGED_TRAIN = _UD_23_HOME + "#UD_Hungarian-Szeged/hu_szeged-ud-train.conllu"
UD_23_HUNGARIAN_SZEGED_DEV = _UD_23_HOME + "#UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu"
UD_23_HUNGARIAN_SZEGED_TEST = _UD_23_HOME + "#UD_Hungarian-Szeged/hu_szeged-ud-test.conllu"
UD_23_INDONESIAN_GSD_TRAIN = _UD_23_HOME + "#UD_Indonesian-GSD/id_gsd-ud-train.conllu"
UD_23_INDONESIAN_GSD_DEV = _UD_23_HOME + "#UD_Indonesian-GSD/id_gsd-ud-dev.conllu"
UD_23_INDONESIAN_GSD_TEST = _UD_23_HOME + "#UD_Indonesian-GSD/id_gsd-ud-test.conllu"
UD_23_INDONESIAN_PUD_TEST = _UD_23_HOME + "#UD_Indonesian-PUD/id_pud-ud-test.conllu"
UD_23_IRISH_IDT_TRAIN = _UD_23_HOME + "#UD_Irish-IDT/ga_idt-ud-train.conllu"
UD_23_IRISH_IDT_TEST = _UD_23_HOME + "#UD_Irish-IDT/ga_idt-ud-test.conllu"
UD_23_ITALIAN_ISDT_TRAIN = _UD_23_HOME + "#UD_Italian-ISDT/it_isdt-ud-train.conllu"
UD_23_ITALIAN_ISDT_DEV = _UD_23_HOME + "#UD_Italian-ISDT/it_isdt-ud-dev.conllu"
UD_23_ITALIAN_ISDT_TEST = _UD_23_HOME + "#UD_Italian-ISDT/it_isdt-ud-test.conllu"
UD_23_ITALIAN_PUD_TEST = _UD_23_HOME + "#UD_Italian-PUD/it_pud-ud-test.conllu"
UD_23_ITALIAN_PARTUT_TRAIN = _UD_23_HOME + "#UD_Italian-ParTUT/it_partut-ud-train.conllu"
UD_23_ITALIAN_PARTUT_DEV = _UD_23_HOME + "#UD_Italian-ParTUT/it_partut-ud-dev.conllu"
UD_23_ITALIAN_PARTUT_TEST = _UD_23_HOME + "#UD_Italian-ParTUT/it_partut-ud-test.conllu"
UD_23_ITALIAN_POSTWITA_TRAIN = _UD_23_HOME + "#UD_Italian-PoSTWITA/it_postwita-ud-train.conllu"
UD_23_ITALIAN_POSTWITA_DEV = _UD_23_HOME + "#UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu"
UD_23_ITALIAN_POSTWITA_TEST = _UD_23_HOME + "#UD_Italian-PoSTWITA/it_postwita-ud-test.conllu"
UD_23_JAPANESE_BCCWJ_TRAIN = _UD_23_HOME + "#UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu"
UD_23_JAPANESE_BCCWJ_DEV = _UD_23_HOME + "#UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu"
UD_23_JAPANESE_BCCWJ_TEST = _UD_23_HOME + "#UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu"
UD_23_JAPANESE_GSD_TRAIN = _UD_23_HOME + "#UD_Japanese-GSD/ja_gsd-ud-train.conllu"
UD_23_JAPANESE_GSD_DEV = _UD_23_HOME + "#UD_Japanese-GSD/ja_gsd-ud-dev.conllu"
UD_23_JAPANESE_GSD_TEST = _UD_23_HOME + "#UD_Japanese-GSD/ja_gsd-ud-test.conllu"
UD_23_JAPANESE_MODERN_TEST = _UD_23_HOME + "#UD_Japanese-Modern/ja_modern-ud-test.conllu"
UD_23_JAPANESE_PUD_TEST = _UD_23_HOME + "#UD_Japanese-PUD/ja_pud-ud-test.conllu"
UD_23_KAZAKH_KTB_TRAIN = _UD_23_HOME + "#UD_Kazakh-KTB/kk_ktb-ud-train.conllu"
UD_23_KAZAKH_KTB_TEST = _UD_23_HOME + "#UD_Kazakh-KTB/kk_ktb-ud-test.conllu"
UD_23_KOMI_ZYRIAN_IKDP_TEST = _UD_23_HOME + "#UD_Komi_Zyrian-IKDP/kpv_ikdp-ud-test.conllu"
UD_23_KOMI_ZYRIAN_LATTICE_TEST = _UD_23_HOME + "#UD_Komi_Zyrian-Lattice/kpv_lattice-ud-test.conllu"
UD_23_KOREAN_GSD_TRAIN = _UD_23_HOME + "#UD_Korean-GSD/ko_gsd-ud-train.conllu"
UD_23_KOREAN_GSD_DEV = _UD_23_HOME + "#UD_Korean-GSD/ko_gsd-ud-dev.conllu"
UD_23_KOREAN_GSD_TEST = _UD_23_HOME + "#UD_Korean-GSD/ko_gsd-ud-test.conllu"
UD_23_KOREAN_KAIST_TRAIN = _UD_23_HOME + "#UD_Korean-Kaist/ko_kaist-ud-train.conllu"
UD_23_KOREAN_KAIST_DEV = _UD_23_HOME + "#UD_Korean-Kaist/ko_kaist-ud-dev.conllu"
UD_23_KOREAN_KAIST_TEST = _UD_23_HOME + "#UD_Korean-Kaist/ko_kaist-ud-test.conllu"
UD_23_KOREAN_PUD_TEST = _UD_23_HOME + "#UD_Korean-PUD/ko_pud-ud-test.conllu"
UD_23_KURMANJI_MG_TRAIN = _UD_23_HOME + "#UD_Kurmanji-MG/kmr_mg-ud-train.conllu"
UD_23_KURMANJI_MG_TEST = _UD_23_HOME + "#UD_Kurmanji-MG/kmr_mg-ud-test.conllu"
UD_23_LATIN_ITTB_TRAIN = _UD_23_HOME + "#UD_Latin-ITTB/la_ittb-ud-train.conllu"
UD_23_LATIN_ITTB_DEV = _UD_23_HOME + "#UD_Latin-ITTB/la_ittb-ud-dev.conllu"
UD_23_LATIN_ITTB_TEST = _UD_23_HOME + "#UD_Latin-ITTB/la_ittb-ud-test.conllu"
UD_23_LATIN_PROIEL_TRAIN = _UD_23_HOME + "#UD_Latin-PROIEL/la_proiel-ud-train.conllu"
UD_23_LATIN_PROIEL_DEV = _UD_23_HOME + "#UD_Latin-PROIEL/la_proiel-ud-dev.conllu"
UD_23_LATIN_PROIEL_TEST = _UD_23_HOME + "#UD_Latin-PROIEL/la_proiel-ud-test.conllu"
UD_23_LATIN_PERSEUS_TRAIN = _UD_23_HOME + "#UD_Latin-Perseus/la_perseus-ud-train.conllu"
UD_23_LATIN_PERSEUS_TEST = _UD_23_HOME + "#UD_Latin-Perseus/la_perseus-ud-test.conllu"
UD_23_LATVIAN_LVTB_TRAIN = _UD_23_HOME + "#UD_Latvian-LVTB/lv_lvtb-ud-train.conllu"
UD_23_LATVIAN_LVTB_DEV = _UD_23_HOME + "#UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu"
UD_23_LATVIAN_LVTB_TEST = _UD_23_HOME + "#UD_Latvian-LVTB/lv_lvtb-ud-test.conllu"
UD_23_LITHUANIAN_HSE_TRAIN = _UD_23_HOME + "#UD_Lithuanian-HSE/lt_hse-ud-train.conllu"
UD_23_LITHUANIAN_HSE_DEV = _UD_23_HOME + "#UD_Lithuanian-HSE/lt_hse-ud-dev.conllu"
UD_23_LITHUANIAN_HSE_TEST = _UD_23_HOME + "#UD_Lithuanian-HSE/lt_hse-ud-test.conllu"
UD_23_MALTESE_MUDT_TRAIN = _UD_23_HOME + "#UD_Maltese-MUDT/mt_mudt-ud-train.conllu"
UD_23_MALTESE_MUDT_DEV = _UD_23_HOME + "#UD_Maltese-MUDT/mt_mudt-ud-dev.conllu"
UD_23_MALTESE_MUDT_TEST = _UD_23_HOME + "#UD_Maltese-MUDT/mt_mudt-ud-test.conllu"
UD_23_MARATHI_UFAL_TRAIN = _UD_23_HOME + "#UD_Marathi-UFAL/mr_ufal-ud-train.conllu"
UD_23_MARATHI_UFAL_DEV = _UD_23_HOME + "#UD_Marathi-UFAL/mr_ufal-ud-dev.conllu"
UD_23_MARATHI_UFAL_TEST = _UD_23_HOME + "#UD_Marathi-UFAL/mr_ufal-ud-test.conllu"
UD_23_NAIJA_NSC_TEST = _UD_23_HOME + "#UD_Naija-NSC/pcm_nsc-ud-test.conllu"
UD_23_NORTH_SAMI_GIELLA_TRAIN = _UD_23_HOME + "#UD_North_Sami-Giella/sme_giella-ud-train.conllu"
UD_23_NORTH_SAMI_GIELLA_TEST = _UD_23_HOME + "#UD_North_Sami-Giella/sme_giella-ud-test.conllu"
UD_23_NORWEGIAN_BOKMAAL_TRAIN = _UD_23_HOME + "#UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu"
UD_23_NORWEGIAN_BOKMAAL_DEV = _UD_23_HOME + "#UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu"
UD_23_NORWEGIAN_BOKMAAL_TEST = _UD_23_HOME + "#UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu"
UD_23_NORWEGIAN_NYNORSK_TRAIN = _UD_23_HOME + "#UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu"
UD_23_NORWEGIAN_NYNORSK_DEV = _UD_23_HOME + "#UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu"
UD_23_NORWEGIAN_NYNORSK_TEST = _UD_23_HOME + "#UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu"
UD_23_NORWEGIAN_NYNORSKLIA_TRAIN = _UD_23_HOME + "#UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu"
UD_23_NORWEGIAN_NYNORSKLIA_TEST = _UD_23_HOME + "#UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu"
UD_23_OLD_CHURCH_SLAVONIC_PROIEL_TRAIN = _UD_23_HOME + "#UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu"
UD_23_OLD_CHURCH_SLAVONIC_PROIEL_DEV = _UD_23_HOME + "#UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu"
UD_23_OLD_CHURCH_SLAVONIC_PROIEL_TEST = _UD_23_HOME + "#UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu"
UD_23_OLD_FRENCH_SRCMF_TRAIN = _UD_23_HOME + "#UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu"
UD_23_OLD_FRENCH_SRCMF_DEV = _UD_23_HOME + "#UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu"
UD_23_OLD_FRENCH_SRCMF_TEST = _UD_23_HOME + "#UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu"
UD_23_PERSIAN_SERAJI_TRAIN = _UD_23_HOME + "#UD_Persian-Seraji/fa_seraji-ud-train.conllu"
UD_23_PERSIAN_SERAJI_DEV = _UD_23_HOME + "#UD_Persian-Seraji/fa_seraji-ud-dev.conllu"
UD_23_PERSIAN_SERAJI_TEST = _UD_23_HOME + "#UD_Persian-Seraji/fa_seraji-ud-test.conllu"
UD_23_POLISH_LFG_TRAIN = _UD_23_HOME + "#UD_Polish-LFG/pl_lfg-ud-train.conllu"
UD_23_POLISH_LFG_DEV = _UD_23_HOME + "#UD_Polish-LFG/pl_lfg-ud-dev.conllu"
UD_23_POLISH_LFG_TEST = _UD_23_HOME + "#UD_Polish-LFG/pl_lfg-ud-test.conllu"
UD_23_POLISH_SZ_TRAIN = _UD_23_HOME + "#UD_Polish-SZ/pl_sz-ud-train.conllu"
UD_23_POLISH_SZ_DEV = _UD_23_HOME + "#UD_Polish-SZ/pl_sz-ud-dev.conllu"
UD_23_POLISH_SZ_TEST = _UD_23_HOME + "#UD_Polish-SZ/pl_sz-ud-test.conllu"
UD_23_PORTUGUESE_BOSQUE_TRAIN = _UD_23_HOME + "#UD_Portuguese-Bosque/pt_bosque-ud-train.conllu"
UD_23_PORTUGUESE_BOSQUE_DEV = _UD_23_HOME + "#UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu"
UD_23_PORTUGUESE_BOSQUE_TEST = _UD_23_HOME + "#UD_Portuguese-Bosque/pt_bosque-ud-test.conllu"
UD_23_PORTUGUESE_GSD_TRAIN = _UD_23_HOME + "#UD_Portuguese-GSD/pt_gsd-ud-train.conllu"
UD_23_PORTUGUESE_GSD_DEV = _UD_23_HOME + "#UD_Portuguese-GSD/pt_gsd-ud-dev.conllu"
UD_23_PORTUGUESE_GSD_TEST = _UD_23_HOME + "#UD_Portuguese-GSD/pt_gsd-ud-test.conllu"
UD_23_PORTUGUESE_PUD_TEST = _UD_23_HOME + "#UD_Portuguese-PUD/pt_pud-ud-test.conllu"
UD_23_ROMANIAN_NONSTANDARD_TRAIN = _UD_23_HOME + "#UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu"
UD_23_ROMANIAN_NONSTANDARD_DEV = _UD_23_HOME + "#UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu"
UD_23_ROMANIAN_NONSTANDARD_TEST = _UD_23_HOME + "#UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu"
UD_23_ROMANIAN_RRT_TRAIN = _UD_23_HOME + "#UD_Romanian-RRT/ro_rrt-ud-train.conllu"
UD_23_ROMANIAN_RRT_DEV = _UD_23_HOME + "#UD_Romanian-RRT/ro_rrt-ud-dev.conllu"
UD_23_ROMANIAN_RRT_TEST = _UD_23_HOME + "#UD_Romanian-RRT/ro_rrt-ud-test.conllu"
UD_23_RUSSIAN_GSD_TRAIN = _UD_23_HOME + "#UD_Russian-GSD/ru_gsd-ud-train.conllu"
UD_23_RUSSIAN_GSD_DEV = _UD_23_HOME + "#UD_Russian-GSD/ru_gsd-ud-dev.conllu"
UD_23_RUSSIAN_GSD_TEST = _UD_23_HOME + "#UD_Russian-GSD/ru_gsd-ud-test.conllu"
UD_23_RUSSIAN_PUD_TEST = _UD_23_HOME + "#UD_Russian-PUD/ru_pud-ud-test.conllu"
UD_23_RUSSIAN_SYNTAGRUS_TRAIN = _UD_23_HOME + "#UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu"
UD_23_RUSSIAN_SYNTAGRUS_DEV = _UD_23_HOME + "#UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu"
UD_23_RUSSIAN_SYNTAGRUS_TEST = _UD_23_HOME + "#UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu"
UD_23_RUSSIAN_TAIGA_TRAIN = _UD_23_HOME + "#UD_Russian-Taiga/ru_taiga-ud-train.conllu"
UD_23_RUSSIAN_TAIGA_TEST = _UD_23_HOME + "#UD_Russian-Taiga/ru_taiga-ud-test.conllu"
UD_23_SANSKRIT_UFAL_TEST = _UD_23_HOME + "#UD_Sanskrit-UFAL/sa_ufal-ud-test.conllu"
UD_23_SERBIAN_SET_TRAIN = _UD_23_HOME + "#UD_Serbian-SET/sr_set-ud-train.conllu"
UD_23_SERBIAN_SET_DEV = _UD_23_HOME + "#UD_Serbian-SET/sr_set-ud-dev.conllu"
UD_23_SERBIAN_SET_TEST = _UD_23_HOME + "#UD_Serbian-SET/sr_set-ud-test.conllu"
UD_23_SLOVAK_SNK_TRAIN = _UD_23_HOME + "#UD_Slovak-SNK/sk_snk-ud-train.conllu"
UD_23_SLOVAK_SNK_DEV = _UD_23_HOME + "#UD_Slovak-SNK/sk_snk-ud-dev.conllu"
UD_23_SLOVAK_SNK_TEST = _UD_23_HOME + "#UD_Slovak-SNK/sk_snk-ud-test.conllu"
UD_23_SLOVENIAN_SSJ_TRAIN = _UD_23_HOME + "#UD_Slovenian-SSJ/sl_ssj-ud-train.conllu"
UD_23_SLOVENIAN_SSJ_DEV = _UD_23_HOME + "#UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu"
UD_23_SLOVENIAN_SSJ_TEST = _UD_23_HOME + "#UD_Slovenian-SSJ/sl_ssj-ud-test.conllu"
UD_23_SLOVENIAN_SST_TRAIN = _UD_23_HOME + "#UD_Slovenian-SST/sl_sst-ud-train.conllu"
UD_23_SLOVENIAN_SST_TEST = _UD_23_HOME + "#UD_Slovenian-SST/sl_sst-ud-test.conllu"
UD_23_SPANISH_ANCORA_TRAIN = _UD_23_HOME + "#UD_Spanish-AnCora/es_ancora-ud-train.conllu"
UD_23_SPANISH_ANCORA_DEV = _UD_23_HOME + "#UD_Spanish-AnCora/es_ancora-ud-dev.conllu"
UD_23_SPANISH_ANCORA_TEST = _UD_23_HOME + "#UD_Spanish-AnCora/es_ancora-ud-test.conllu"
UD_23_SPANISH_GSD_TRAIN = _UD_23_HOME + "#UD_Spanish-GSD/es_gsd-ud-train.conllu"
UD_23_SPANISH_GSD_DEV = _UD_23_HOME + "#UD_Spanish-GSD/es_gsd-ud-dev.conllu"
UD_23_SPANISH_GSD_TEST = _UD_23_HOME + "#UD_Spanish-GSD/es_gsd-ud-test.conllu"
UD_23_SPANISH_PUD_TEST = _UD_23_HOME + "#UD_Spanish-PUD/es_pud-ud-test.conllu"
UD_23_SWEDISH_LINES_TRAIN = _UD_23_HOME + "#UD_Swedish-LinES/sv_lines-ud-train.conllu"
UD_23_SWEDISH_LINES_DEV = _UD_23_HOME + "#UD_Swedish-LinES/sv_lines-ud-dev.conllu"
UD_23_SWEDISH_LINES_TEST = _UD_23_HOME + "#UD_Swedish-LinES/sv_lines-ud-test.conllu"
UD_23_SWEDISH_PUD_TEST = _UD_23_HOME + "#UD_Swedish-PUD/sv_pud-ud-test.conllu"
UD_23_SWEDISH_TALBANKEN_TRAIN = _UD_23_HOME + "#UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu"
UD_23_SWEDISH_TALBANKEN_DEV = _UD_23_HOME + "#UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu"
UD_23_SWEDISH_TALBANKEN_TEST = _UD_23_HOME + "#UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu"
UD_23_SWEDISH_SIGN_LANGUAGE_SSLC_TRAIN = _UD_23_HOME + "#UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu"
UD_23_SWEDISH_SIGN_LANGUAGE_SSLC_DEV = _UD_23_HOME + "#UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu"
UD_23_SWEDISH_SIGN_LANGUAGE_SSLC_TEST = _UD_23_HOME + "#UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu"
UD_23_TAGALOG_TRG_TEST = _UD_23_HOME + "#UD_Tagalog-TRG/tl_trg-ud-test.conllu"
UD_23_TAMIL_TTB_TRAIN = _UD_23_HOME + "#UD_Tamil-TTB/ta_ttb-ud-train.conllu"
UD_23_TAMIL_TTB_DEV = _UD_23_HOME + "#UD_Tamil-TTB/ta_ttb-ud-dev.conllu"
UD_23_TAMIL_TTB_TEST = _UD_23_HOME + "#UD_Tamil-TTB/ta_ttb-ud-test.conllu"
UD_23_TELUGU_MTG_TRAIN = _UD_23_HOME + "#UD_Telugu-MTG/te_mtg-ud-train.conllu"
UD_23_TELUGU_MTG_DEV = _UD_23_HOME + "#UD_Telugu-MTG/te_mtg-ud-dev.conllu"
UD_23_TELUGU_MTG_TEST = _UD_23_HOME + "#UD_Telugu-MTG/te_mtg-ud-test.conllu"
UD_23_THAI_PUD_TEST = _UD_23_HOME + "#UD_Thai-PUD/th_pud-ud-test.conllu"
UD_23_TURKISH_IMST_TRAIN = _UD_23_HOME + "#UD_Turkish-IMST/tr_imst-ud-train.conllu"
UD_23_TURKISH_IMST_DEV = _UD_23_HOME + "#UD_Turkish-IMST/tr_imst-ud-dev.conllu"
UD_23_TURKISH_IMST_TEST = _UD_23_HOME + "#UD_Turkish-IMST/tr_imst-ud-test.conllu"
UD_23_TURKISH_PUD_TEST = _UD_23_HOME + "#UD_Turkish-PUD/tr_pud-ud-test.conllu"
UD_23_UKRAINIAN_IU_TRAIN = _UD_23_HOME + "#UD_Ukrainian-IU/uk_iu-ud-train.conllu"
UD_23_UKRAINIAN_IU_DEV = _UD_23_HOME + "#UD_Ukrainian-IU/uk_iu-ud-dev.conllu"
UD_23_UKRAINIAN_IU_TEST = _UD_23_HOME + "#UD_Ukrainian-IU/uk_iu-ud-test.conllu"
UD_23_UPPER_SORBIAN_UFAL_TRAIN = _UD_23_HOME + "#UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu"
UD_23_UPPER_SORBIAN_UFAL_TEST = _UD_23_HOME + "#UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu"
UD_23_URDU_UDTB_TRAIN = _UD_23_HOME + "#UD_Urdu-UDTB/ur_udtb-ud-train.conllu"
UD_23_URDU_UDTB_DEV = _UD_23_HOME + "#UD_Urdu-UDTB/ur_udtb-ud-dev.conllu"
UD_23_URDU_UDTB_TEST = _UD_23_HOME + "#UD_Urdu-UDTB/ur_udtb-ud-test.conllu"
UD_23_UYGHUR_UDT_TRAIN = _UD_23_HOME + "#UD_Uyghur-UDT/ug_udt-ud-train.conllu"
UD_23_UYGHUR_UDT_DEV = _UD_23_HOME + "#UD_Uyghur-UDT/ug_udt-ud-dev.conllu"
UD_23_UYGHUR_UDT_TEST = _UD_23_HOME + "#UD_Uyghur-UDT/ug_udt-ud-test.conllu"
UD_23_VIETNAMESE_VTB_TRAIN = _UD_23_HOME + "#UD_Vietnamese-VTB/vi_vtb-ud-train.conllu"
UD_23_VIETNAMESE_VTB_DEV = _UD_23_HOME + "#UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu"
UD_23_VIETNAMESE_VTB_TEST = _UD_23_HOME + "#UD_Vietnamese-VTB/vi_vtb-ud-test.conllu"
UD_23_WARLPIRI_UFAL_TEST = _UD_23_HOME + "#UD_Warlpiri-UFAL/wbp_ufal-ud-test.conllu"
UD_23_YORUBA_YTB_TEST = _UD_23_HOME + "#UD_Yoruba-YTB/yo_ytb-ud-test.conllu"


================================================
FILE: hanlp/datasets/parsing/ud/ud23m.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-21 20:39
import os

from hanlp.datasets.parsing.ud import concat_treebanks
from .ud23 import _UD_23_HOME

# Home of the multilingual UD 2.3 corpus, as returned by concat_treebanks.
# NOTE(review): presumably concat_treebanks merges the treebanks found under
# _UD_23_HOME into one corpus - confirm against its implementation.
_UD_23_MULTILINGUAL_HOME = concat_treebanks(_UD_23_HOME, '2.3')
# One ``<split>.conllu`` path per split, located directly under the multilingual home.
UD_23_MULTILINGUAL_TRAIN, UD_23_MULTILINGUAL_DEV, UD_23_MULTILINGUAL_TEST = (
    os.path.join(_UD_23_MULTILINGUAL_HOME, _split + '.conllu') for _split in ('train', 'dev', 'test')
)


================================================
FILE: hanlp/datasets/parsing/ud/ud27.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-07 21:03
import glob
import os

from hanlp.utils.io_util import uncompress, get_resource

# URL of the UD 2.7 release archive. Every constant in this module is built as
# ``_UD_27_HOME + "<treebank dir>/<file>.conllu"``.
_UD_27_URL = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3424/allzip"
_UD_27_HOME = _UD_27_URL + '#ud-treebanks-v2.7/'
# NOTE(review): get_resource presumably fetches the URL and returns its local path - confirm.
_path = get_resource(_UD_27_URL)
if os.path.isfile(_path):
    # The resource is still a single plain file: give it a ``.zip`` suffix, unpack it,
    # then unpack the treebank tarball that is expected below the ``_path`` directory.
    # os.replace is used instead of os.rename so that a stale ``.zip`` left behind by an
    # interrupted earlier run is overwritten on Windows too, instead of raising
    # FileExistsError at import time.
    os.replace(_path, _path + '.zip')
    uncompress(_path + '.zip')
    uncompress(os.path.join(_path, 'ud-treebanks-v2.7.tgz'))


# noinspection PyShadowingNames
def _list_dir(path, home):
    """Append generated UD 2.7 constant definitions to ``ud27.py``.

    Walks every ``UD_*`` entry below the ``ud-treebanks-v2.7`` folder of the resolved
    resource. For each of the ``train``/``dev``/``test`` splits that has a matching
    ``*{split}.conllu`` file, two lines of Python source are appended to ``ud27.py`` in
    the current working directory: the constant assignment and a one-line string
    describing it.

    Args:
        path: Resource identifier passed to ``get_resource``; the value it returns is
            used as the local root directory to scan.
        home: Name of the variable holding the corpus home (e.g. ``'_UD_27_HOME'``). It
            is written verbatim into the generated code; with leading underscores and
            ``_HOME`` removed it also serves as the prefix of every constant name.
    """
    prefix = home.lstrip('_').replace('_HOME', '')

    path = get_resource(path)
    # Explicit encoding so the generated source does not depend on the platform default.
    with open('ud27.py', 'a', encoding='utf-8') as out:
        for f in sorted(glob.glob(path + '/ud-treebanks-v2.7/UD_*')):
            basename = os.path.basename(f)
            # "UD_Ancient_Greek-PROIEL" -> "ANCIENT_GREEK_PROIEL"
            name = basename[len('UD_'):].upper().replace('-', '_')
            for split in 'train', 'dev', 'test':
                # glob yields matches in arbitrary order; sort so that the file picked
                # below is the same on every run and every platform.
                sp = sorted(glob.glob(f + f'/*{split}.conllu'))
                if not sp:
                    # This treebank does not ship the split (many are test-only).
                    continue
                sp = os.path.basename(sp[0])
                out.write(f'{prefix}_{name}_{split.upper()} = {home} + "{basename}/{sp}"\n')
                out.write(f'"{prefix} {split} set of {name}."\n')


def main():
    """Append the generated UD 2.7 constant definitions to ``ud27.py`` via ``_list_dir``."""
    _list_dir(path=_UD_27_URL, home='_UD_27_HOME')


# Running this module as a script triggers the code generation above.
if __name__ == '__main__':
    main()
UD_27_AFRIKAANS_AFRIBOOMS_TRAIN = _UD_27_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu"
"UD_27 train set of AFRIKAANS_AFRIBOOMS."
UD_27_AFRIKAANS_AFRIBOOMS_DEV = _UD_27_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu"
"UD_27 dev set of AFRIKAANS_AFRIBOOMS."
UD_27_AFRIKAANS_AFRIBOOMS_TEST = _UD_27_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu"
"UD_27 test set of AFRIKAANS_AFRIBOOMS."
UD_27_AKKADIAN_PISANDUB_TEST = _UD_27_HOME + "UD_Akkadian-PISANDUB/akk_pisandub-ud-test.conllu"
"UD_27 test set of AKKADIAN_PISANDUB."
UD_27_AKKADIAN_RIAO_TEST = _UD_27_HOME + "UD_Akkadian-RIAO/akk_riao-ud-test.conllu"
"UD_27 test set of AKKADIAN_RIAO."
UD_27_AKUNTSU_TUDET_TEST = _UD_27_HOME + "UD_Akuntsu-TuDeT/aqz_tudet-ud-test.conllu"
"UD_27 test set of AKUNTSU_TUDET."
UD_27_ALBANIAN_TSA_TEST = _UD_27_HOME + "UD_Albanian-TSA/sq_tsa-ud-test.conllu"
"UD_27 test set of ALBANIAN_TSA."
UD_27_AMHARIC_ATT_TEST = _UD_27_HOME + "UD_Amharic-ATT/am_att-ud-test.conllu"
"UD_27 test set of AMHARIC_ATT."
UD_27_ANCIENT_GREEK_PROIEL_TRAIN = _UD_27_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu"
"UD_27 train set of ANCIENT_GREEK_PROIEL."
UD_27_ANCIENT_GREEK_PROIEL_DEV = _UD_27_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu"
"UD_27 dev set of ANCIENT_GREEK_PROIEL."
UD_27_ANCIENT_GREEK_PROIEL_TEST = _UD_27_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu"
"UD_27 test set of ANCIENT_GREEK_PROIEL."
UD_27_ANCIENT_GREEK_PERSEUS_TRAIN = _UD_27_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu"
"UD_27 train set of ANCIENT_GREEK_PERSEUS."
UD_27_ANCIENT_GREEK_PERSEUS_DEV = _UD_27_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu"
"UD_27 dev set of ANCIENT_GREEK_PERSEUS."
UD_27_ANCIENT_GREEK_PERSEUS_TEST = _UD_27_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu"
"UD_27 test set of ANCIENT_GREEK_PERSEUS."
UD_27_APURINA_UFPA_TEST = _UD_27_HOME + "UD_Apurina-UFPA/apu_ufpa-ud-test.conllu"
"UD_27 test set of APURINA_UFPA."
UD_27_ARABIC_NYUAD_TRAIN = _UD_27_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu"
"UD_27 train set of ARABIC_NYUAD."
UD_27_ARABIC_NYUAD_DEV = _UD_27_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu"
"UD_27 dev set of ARABIC_NYUAD."
UD_27_ARABIC_NYUAD_TEST = _UD_27_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu"
"UD_27 test set of ARABIC_NYUAD."
UD_27_ARABIC_PADT_TRAIN = _UD_27_HOME + "UD_Arabic-PADT/ar_padt-ud-train.conllu"
"UD_27 train set of ARABIC_PADT."
UD_27_ARABIC_PADT_DEV = _UD_27_HOME + "UD_Arabic-PADT/ar_padt-ud-dev.conllu"
"UD_27 dev set of ARABIC_PADT."
UD_27_ARABIC_PADT_TEST = _UD_27_HOME + "UD_Arabic-PADT/ar_padt-ud-test.conllu"
"UD_27 test set of ARABIC_PADT."
UD_27_ARABIC_PUD_TEST = _UD_27_HOME + "UD_Arabic-PUD/ar_pud-ud-test.conllu"
"UD_27 test set of ARABIC_PUD."
UD_27_ARMENIAN_ARMTDP_TRAIN = _UD_27_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu"
"UD_27 train set of ARMENIAN_ARMTDP."
UD_27_ARMENIAN_ARMTDP_DEV = _UD_27_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu"
"UD_27 dev set of ARMENIAN_ARMTDP."
UD_27_ARMENIAN_ARMTDP_TEST = _UD_27_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu"
"UD_27 test set of ARMENIAN_ARMTDP."
UD_27_ASSYRIAN_AS_TEST = _UD_27_HOME + "UD_Assyrian-AS/aii_as-ud-test.conllu"
"UD_27 test set of ASSYRIAN_AS."
UD_27_BAMBARA_CRB_TEST = _UD_27_HOME + "UD_Bambara-CRB/bm_crb-ud-test.conllu"
"UD_27 test set of BAMBARA_CRB."
UD_27_BASQUE_BDT_TRAIN = _UD_27_HOME + "UD_Basque-BDT/eu_bdt-ud-train.conllu"
"UD_27 train set of BASQUE_BDT."
UD_27_BASQUE_BDT_DEV = _UD_27_HOME + "UD_Basque-BDT/eu_bdt-ud-dev.conllu"
"UD_27 dev set of BASQUE_BDT."
UD_27_BASQUE_BDT_TEST = _UD_27_HOME + "UD_Basque-BDT/eu_bdt-ud-test.conllu"
"UD_27 test set of BASQUE_BDT."
UD_27_BELARUSIAN_HSE_TRAIN = _UD_27_HOME + "UD_Belarusian-HSE/be_hse-ud-train.conllu"
"UD_27 train set of BELARUSIAN_HSE."
UD_27_BELARUSIAN_HSE_DEV = _UD_27_HOME + "UD_Belarusian-HSE/be_hse-ud-dev.conllu"
"UD_27 dev set of BELARUSIAN_HSE."
UD_27_BELARUSIAN_HSE_TEST = _UD_27_HOME + "UD_Belarusian-HSE/be_hse-ud-test.conllu"
"UD_27 test set of BELARUSIAN_HSE."
UD_27_BHOJPURI_BHTB_TEST = _UD_27_HOME + "UD_Bhojpuri-BHTB/bho_bhtb-ud-test.conllu"
"UD_27 test set of BHOJPURI_BHTB."
UD_27_BRETON_KEB_TEST = _UD_27_HOME + "UD_Breton-KEB/br_keb-ud-test.conllu"
"UD_27 test set of BRETON_KEB."
UD_27_BULGARIAN_BTB_TRAIN = _UD_27_HOME + "UD_Bulgarian-BTB/bg_btb-ud-train.conllu"
"UD_27 train set of BULGARIAN_BTB."
UD_27_BULGARIAN_BTB_DEV = _UD_27_HOME + "UD_Bulgarian-BTB/bg_btb-ud-dev.conllu"
"UD_27 dev set of BULGARIAN_BTB."
UD_27_BULGARIAN_BTB_TEST = _UD_27_HOME + "UD_Bulgarian-BTB/bg_btb-ud-test.conllu"
"UD_27 test set of BULGARIAN_BTB."
UD_27_BURYAT_BDT_TRAIN = _UD_27_HOME + "UD_Buryat-BDT/bxr_bdt-ud-train.conllu"
"UD_27 train set of BURYAT_BDT."
UD_27_BURYAT_BDT_TEST = _UD_27_HOME + "UD_Buryat-BDT/bxr_bdt-ud-test.conllu"
"UD_27 test set of BURYAT_BDT."
UD_27_CANTONESE_HK_TEST = _UD_27_HOME + "UD_Cantonese-HK/yue_hk-ud-test.conllu"
"UD_27 test set of CANTONESE_HK."
UD_27_CATALAN_ANCORA_TRAIN = _UD_27_HOME + "UD_Catalan-AnCora/ca_ancora-ud-train.conllu"
"UD_27 train set of CATALAN_ANCORA."
UD_27_CATALAN_ANCORA_DEV = _UD_27_HOME + "UD_Catalan-AnCora/ca_ancora-ud-dev.conllu"
"UD_27 dev set of CATALAN_ANCORA."
UD_27_CATALAN_ANCORA_TEST = _UD_27_HOME + "UD_Catalan-AnCora/ca_ancora-ud-test.conllu"
"UD_27 test set of CATALAN_ANCORA."
UD_27_CHINESE_CFL_TEST = _UD_27_HOME + "UD_Chinese-CFL/zh_cfl-ud-test.conllu"
"UD_27 test set of CHINESE_CFL."
UD_27_CHINESE_GSD_TRAIN = _UD_27_HOME + "UD_Chinese-GSD/zh_gsd-ud-train.conllu"
"UD_27 train set of CHINESE_GSD."
UD_27_CHINESE_GSD_DEV = _UD_27_HOME + "UD_Chinese-GSD/zh_gsd-ud-dev.conllu"
"UD_27 dev set of CHINESE_GSD."
UD_27_CHINESE_GSD_TEST = _UD_27_HOME + "UD_Chinese-GSD/zh_gsd-ud-test.conllu"
"UD_27 test set of CHINESE_GSD."
UD_27_CHINESE_GSDSIMP_TRAIN = _UD_27_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu"
"UD_27 train set of CHINESE_GSDSIMP."
UD_27_CHINESE_GSDSIMP_DEV = _UD_27_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu"
"UD_27 dev set of CHINESE_GSDSIMP."
UD_27_CHINESE_GSDSIMP_TEST = _UD_27_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-test.conllu"
"UD_27 test set of CHINESE_GSDSIMP."
UD_27_CHINESE_HK_TEST = _UD_27_HOME + "UD_Chinese-HK/zh_hk-ud-test.conllu"
"UD_27 test set of CHINESE_HK."
UD_27_CHINESE_PUD_TEST = _UD_27_HOME + "UD_Chinese-PUD/zh_pud-ud-test.conllu"
"UD_27 test set of CHINESE_PUD."
UD_27_CHUKCHI_HSE_TEST = _UD_27_HOME + "UD_Chukchi-HSE/ckt_hse-ud-test.conllu"
"UD_27 test set of CHUKCHI_HSE."
UD_27_CLASSICAL_CHINESE_KYOTO_TRAIN = _UD_27_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu"
"UD_27 train set of CLASSICAL_CHINESE_KYOTO."
UD_27_CLASSICAL_CHINESE_KYOTO_DEV = _UD_27_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu"
"UD_27 dev set of CLASSICAL_CHINESE_KYOTO."
UD_27_CLASSICAL_CHINESE_KYOTO_TEST = _UD_27_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-test.conllu"
"UD_27 test set of CLASSICAL_CHINESE_KYOTO."
UD_27_COPTIC_SCRIPTORIUM_TRAIN = _UD_27_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu"
"UD_27 train set of COPTIC_SCRIPTORIUM."
UD_27_COPTIC_SCRIPTORIUM_DEV = _UD_27_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu"
"UD_27 dev set of COPTIC_SCRIPTORIUM."
UD_27_COPTIC_SCRIPTORIUM_TEST = _UD_27_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu"
"UD_27 test set of COPTIC_SCRIPTORIUM."
UD_27_CROATIAN_SET_TRAIN = _UD_27_HOME + "UD_Croatian-SET/hr_set-ud-train.conllu"
"UD_27 train set of CROATIAN_SET."
UD_27_CROATIAN_SET_DEV = _UD_27_HOME + "UD_Croatian-SET/hr_set-ud-dev.conllu"
"UD_27 dev set of CROATIAN_SET."
UD_27_CROATIAN_SET_TEST = _UD_27_HOME + "UD_Croatian-SET/hr_set-ud-test.conllu"
"UD_27 test set of CROATIAN_SET."
UD_27_CZECH_CAC_TRAIN = _UD_27_HOME + "UD_Czech-CAC/cs_cac-ud-train.conllu"
"UD_27 train set of CZECH_CAC."
UD_27_CZECH_CAC_DEV = _UD_27_HOME + "UD_Czech-CAC/cs_cac-ud-dev.conllu"
"UD_27 dev set of CZECH_CAC."
UD_27_CZECH_CAC_TEST = _UD_27_HOME + "UD_Czech-CAC/cs_cac-ud-test.conllu"
"UD_27 test set of CZECH_CAC."
UD_27_CZECH_CLTT_TRAIN = _UD_27_HOME + "UD_Czech-CLTT/cs_cltt-ud-train.conllu"
"UD_27 train set of CZECH_CLTT."
UD_27_CZECH_CLTT_DEV = _UD_27_HOME + "UD_Czech-CLTT/cs_cltt-ud-dev.conllu"
"UD_27 dev set of CZECH_CLTT."
UD_27_CZECH_CLTT_TEST = _UD_27_HOME + "UD_Czech-CLTT/cs_cltt-ud-test.conllu"
"UD_27 test set of CZECH_CLTT."
UD_27_CZECH_FICTREE_TRAIN = _UD_27_HOME + "UD_Czech-FicTree/cs_fictree-ud-train.conllu"
"UD_27 train set of CZECH_FICTREE."
UD_27_CZECH_FICTREE_DEV = _UD_27_HOME + "UD_Czech-FicTree/cs_fictree-ud-dev.conllu"
"UD_27 dev set of CZECH_FICTREE."
UD_27_CZECH_FICTREE_TEST = _UD_27_HOME + "UD_Czech-FicTree/cs_fictree-ud-test.conllu"
"UD_27 test set of CZECH_FICTREE."
UD_27_CZECH_PDT_TRAIN = _UD_27_HOME + "UD_Czech-PDT/cs_pdt-ud-train.conllu"
"UD_27 train set of CZECH_PDT."
UD_27_CZECH_PDT_DEV = _UD_27_HOME + "UD_Czech-PDT/cs_pdt-ud-dev.conllu"
"UD_27 dev set of CZECH_PDT."
UD_27_CZECH_PDT_TEST = _UD_27_HOME + "UD_Czech-PDT/cs_pdt-ud-test.conllu"
"UD_27 test set of CZECH_PDT."
UD_27_CZECH_PUD_TEST = _UD_27_HOME + "UD_Czech-PUD/cs_pud-ud-test.conllu"
"UD_27 test set of CZECH_PUD."
UD_27_DANISH_DDT_TRAIN = _UD_27_HOME + "UD_Danish-DDT/da_ddt-ud-train.conllu"
"UD_27 train set of DANISH_DDT."
UD_27_DANISH_DDT_DEV = _UD_27_HOME + "UD_Danish-DDT/da_ddt-ud-dev.conllu"
"UD_27 dev set of DANISH_DDT."
UD_27_DANISH_DDT_TEST = _UD_27_HOME + "UD_Danish-DDT/da_ddt-ud-test.conllu"
"UD_27 test set of DANISH_DDT."
UD_27_DUTCH_ALPINO_TRAIN = _UD_27_HOME + "UD_Dutch-Alpino/nl_alpino-ud-train.conllu"
"UD_27 train set of DUTCH_ALPINO."
UD_27_DUTCH_ALPINO_DEV = _UD_27_HOME + "UD_Dutch-Alpino/nl_alpino-ud-dev.conllu"
"UD_27 dev set of DUTCH_ALPINO."
UD_27_DUTCH_ALPINO_TEST = _UD_27_HOME + "UD_Dutch-Alpino/nl_alpino-ud-test.conllu"
"UD_27 test set of DUTCH_ALPINO."
UD_27_DUTCH_LASSYSMALL_TRAIN = _UD_27_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu"
"UD_27 train set of DUTCH_LASSYSMALL."
UD_27_DUTCH_LASSYSMALL_DEV = _UD_27_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu"
"UD_27 dev set of DUTCH_LASSYSMALL."
UD_27_DUTCH_LASSYSMALL_TEST = _UD_27_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu"
"UD_27 test set of DUTCH_LASSYSMALL."
UD_27_ENGLISH_ESL_TRAIN = _UD_27_HOME + "UD_English-ESL/en_esl-ud-train.conllu"
"UD_27 train set of ENGLISH_ESL."
UD_27_ENGLISH_ESL_DEV = _UD_27_HOME + "UD_English-ESL/en_esl-ud-dev.conllu"
"UD_27 dev set of ENGLISH_ESL."
UD_27_ENGLISH_ESL_TEST = _UD_27_HOME + "UD_English-ESL/en_esl-ud-test.conllu"
"UD_27 test set of ENGLISH_ESL."
UD_27_ENGLISH_EWT_TRAIN = _UD_27_HOME + "UD_English-EWT/en_ewt-ud-train.conllu"
"UD_27 train set of ENGLISH_EWT."
UD_27_ENGLISH_EWT_DEV = _UD_27_HOME + "UD_English-EWT/en_ewt-ud-dev.conllu"
"UD_27 dev set of ENGLISH_EWT."
UD_27_ENGLISH_EWT_TEST = _UD_27_HOME + "UD_English-EWT/en_ewt-ud-test.conllu"
"UD_27 test set of ENGLISH_EWT."
UD_27_ENGLISH_GUM_TRAIN = _UD_27_HOME + "UD_English-GUM/en_gum-ud-train.conllu"
"UD_27 train set of ENGLISH_GUM."
UD_27_ENGLISH_GUM_DEV = _UD_27_HOME + "UD_English-GUM/en_gum-ud-dev.conllu"
"UD_27 dev set of ENGLISH_GUM."
UD_27_ENGLISH_GUM_TEST = _UD_27_HOME + "UD_English-GUM/en_gum-ud-test.conllu"
"UD_27 test set of ENGLISH_GUM."
UD_27_ENGLISH_GUMREDDIT_TRAIN = _UD_27_HOME + "UD_English-GUMReddit/en_gumreddit-ud-train.conllu"
"UD_27 train set of ENGLISH_GUMREDDIT."
UD_27_ENGLISH_GUMREDDIT_DEV = _UD_27_HOME + "UD_English-GUMReddit/en_gumreddit-ud-dev.conllu"
"UD_27 dev set of ENGLISH_GUMREDDIT."
UD_27_ENGLISH_GUMREDDIT_TEST = _UD_27_HOME + "UD_English-GUMReddit/en_gumreddit-ud-test.conllu"
"UD_27 test set of ENGLISH_GUMREDDIT."
UD_27_ENGLISH_LINES_TRAIN = _UD_27_HOME + "UD_English-LinES/en_lines-ud-train.conllu"
"UD_27 train set of ENGLISH_LINES."
UD_27_ENGLISH_LINES_DEV = _UD_27_HOME + "UD_English-LinES/en_lines-ud-dev.conllu"
"UD_27 dev set of ENGLISH_LINES."
UD_27_ENGLISH_LINES_TEST = _UD_27_HOME + "UD_English-LinES/en_lines-ud-test.conllu"
"UD_27 test set of ENGLISH_LINES."
UD_27_ENGLISH_PUD_TEST = _UD_27_HOME + "UD_English-PUD/en_pud-ud-test.conllu"
"UD_27 test set of ENGLISH_PUD."
UD_27_ENGLISH_PARTUT_TRAIN = _UD_27_HOME + "UD_English-ParTUT/en_partut-ud-train.conllu"
"UD_27 train set of ENGLISH_PARTUT."
UD_27_ENGLISH_PARTUT_DEV = _UD_27_HOME + "UD_English-ParTUT/en_partut-ud-dev.conllu"
"UD_27 dev set of ENGLISH_PARTUT."
UD_27_ENGLISH_PARTUT_TEST = _UD_27_HOME + "UD_English-ParTUT/en_partut-ud-test.conllu"
"UD_27 test set of ENGLISH_PARTUT."
UD_27_ENGLISH_PRONOUNS_TEST = _UD_27_HOME + "UD_English-Pronouns/en_pronouns-ud-test.conllu"
"UD_27 test set of ENGLISH_PRONOUNS."
UD_27_ERZYA_JR_TEST = _UD_27_HOME + "UD_Erzya-JR/myv_jr-ud-test.conllu"
"UD_27 test set of ERZYA_JR."
UD_27_ESTONIAN_EDT_TRAIN = _UD_27_HOME + "UD_Estonian-EDT/et_edt-ud-train.conllu"
"UD_27 train set of ESTONIAN_EDT."
UD_27_ESTONIAN_EDT_DEV = _UD_27_HOME + "UD_Estonian-EDT/et_edt-ud-dev.conllu"
"UD_27 dev set of ESTONIAN_EDT."
UD_27_ESTONIAN_EDT_TEST = _UD_27_HOME + "UD_Estonian-EDT/et_edt-ud-test.conllu"
"UD_27 test set of ESTONIAN_EDT."
UD_27_ESTONIAN_EWT_TRAIN = _UD_27_HOME + "UD_Estonian-EWT/et_ewt-ud-train.conllu"
"UD_27 train set of ESTONIAN_EWT."
UD_27_ESTONIAN_EWT_DEV = _UD_27_HOME + "UD_Estonian-EWT/et_ewt-ud-dev.conllu"
"UD_27 dev set of ESTONIAN_EWT."
UD_27_ESTONIAN_EWT_TEST = _UD_27_HOME + "UD_Estonian-EWT/et_ewt-ud-test.conllu"
"UD_27 test set of ESTONIAN_EWT."
UD_27_FAROESE_FARPAHC_TRAIN = _UD_27_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-train.conllu"
"UD_27 train set of FAROESE_FARPAHC."
UD_27_FAROESE_FARPAHC_DEV = _UD_27_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-dev.conllu"
"UD_27 dev set of FAROESE_FARPAHC."
UD_27_FAROESE_FARPAHC_TEST = _UD_27_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-test.conllu"
"UD_27 test set of FAROESE_FARPAHC."
UD_27_FAROESE_OFT_TEST = _UD_27_HOME + "UD_Faroese-OFT/fo_oft-ud-test.conllu"
"UD_27 test set of FAROESE_OFT."
UD_27_FINNISH_FTB_TRAIN = _UD_27_HOME + "UD_Finnish-FTB/fi_ftb-ud-train.conllu"
"UD_27 train set of FINNISH_FTB."
UD_27_FINNISH_FTB_DEV = _UD_27_HOME + "UD_Finnish-FTB/fi_ftb-ud-dev.conllu"
"UD_27 dev set of FINNISH_FTB."
UD_27_FINNISH_FTB_TEST = _UD_27_HOME + "UD_Finnish-FTB/fi_ftb-ud-test.conllu"
"UD_27 test set of FINNISH_FTB."
UD_27_FINNISH_OOD_TEST = _UD_27_HOME + "UD_Finnish-OOD/fi_ood-ud-test.conllu"
"UD_27 test set of FINNISH_OOD."
UD_27_FINNISH_PUD_TEST = _UD_27_HOME + "UD_Finnish-PUD/fi_pud-ud-test.conllu"
"UD_27 test set of FINNISH_PUD."
UD_27_FINNISH_TDT_TRAIN = _UD_27_HOME + "UD_Finnish-TDT/fi_tdt-ud-train.conllu"
"UD_27 train set of FINNISH_TDT."
UD_27_FINNISH_TDT_DEV = _UD_27_HOME + "UD_Finnish-TDT/fi_tdt-ud-dev.conllu"
"UD_27 dev set of FINNISH_TDT."
UD_27_FINNISH_TDT_TEST = _UD_27_HOME + "UD_Finnish-TDT/fi_tdt-ud-test.conllu"
"UD_27 test set of FINNISH_TDT."
UD_27_FRENCH_FQB_TEST = _UD_27_HOME + "UD_French-FQB/fr_fqb-ud-test.conllu"
"UD_27 test set of FRENCH_FQB."
UD_27_FRENCH_FTB_TRAIN = _UD_27_HOME + "UD_French-FTB/fr_ftb-ud-train.conllu"
"UD_27 train set of FRENCH_FTB."
UD_27_FRENCH_FTB_DEV = _UD_27_HOME + "UD_French-FTB/fr_ftb-ud-dev.conllu"
"UD_27 dev set of FRENCH_FTB."
UD_27_FRENCH_FTB_TEST = _UD_27_HOME + "UD_French-FTB/fr_ftb-ud-test.conllu"
"UD_27 test set of FRENCH_FTB."
UD_27_FRENCH_GSD_TRAIN = _UD_27_HOME + "UD_French-GSD/fr_gsd-ud-train.conllu"
"UD_27 train set of FRENCH_GSD."
UD_27_FRENCH_GSD_DEV = _UD_27_HOME + "UD_French-GSD/fr_gsd-ud-dev.conllu"
"UD_27 dev set of FRENCH_GSD."
UD_27_FRENCH_GSD_TEST = _UD_27_HOME + "UD_French-GSD/fr_gsd-ud-test.conllu"
"UD_27 test set of FRENCH_GSD."
UD_27_FRENCH_PUD_TEST = _UD_27_HOME + "UD_French-PUD/fr_pud-ud-test.conllu"
"UD_27 test set of FRENCH_PUD."
UD_27_FRENCH_PARTUT_TRAIN = _UD_27_HOME + "UD_French-ParTUT/fr_partut-ud-train.conllu"
"UD_27 train set of FRENCH_PARTUT."
UD_27_FRENCH_PARTUT_DEV = _UD_27_HOME + "UD_French-ParTUT/fr_partut-ud-dev.conllu"
"UD_27 dev set of FRENCH_PARTUT."
UD_27_FRENCH_PARTUT_TEST = _UD_27_HOME + "UD_French-ParTUT/fr_partut-ud-test.conllu"
"UD_27 test set of FRENCH_PARTUT."
UD_27_FRENCH_SEQUOIA_TRAIN = _UD_27_HOME + "UD_French-Sequoia/fr_sequoia-ud-train.conllu"
"UD_27 train set of FRENCH_SEQUOIA."
UD_27_FRENCH_SEQUOIA_DEV = _UD_27_HOME + "UD_French-Sequoia/fr_sequoia-ud-dev.conllu"
"UD_27 dev set of FRENCH_SEQUOIA."
UD_27_FRENCH_SEQUOIA_TEST = _UD_27_HOME + "UD_French-Sequoia/fr_sequoia-ud-test.conllu"
"UD_27 test set of FRENCH_SEQUOIA."
UD_27_FRENCH_SPOKEN_TRAIN = _UD_27_HOME + "UD_French-Spoken/fr_spoken-ud-train.conllu"
"UD_27 train set of FRENCH_SPOKEN."
UD_27_FRENCH_SPOKEN_DEV = _UD_27_HOME + "UD_French-Spoken/fr_spoken-ud-dev.conllu"
"UD_27 dev set of FRENCH_SPOKEN."
UD_27_FRENCH_SPOKEN_TEST = _UD_27_HOME + "UD_French-Spoken/fr_spoken-ud-test.conllu"
"UD_27 test set of FRENCH_SPOKEN."
UD_27_GALICIAN_CTG_TRAIN = _UD_27_HOME + "UD_Galician-CTG/gl_ctg-ud-train.conllu"
"UD_27 train set of GALICIAN_CTG."
UD_27_GALICIAN_CTG_DEV = _UD_27_HOME + "UD_Galician-CTG/gl_ctg-ud-dev.conllu"
"UD_27 dev set of GALICIAN_CTG."
UD_27_GALICIAN_CTG_TEST = _UD_27_HOME + "UD_Galician-CTG/gl_ctg-ud-test.conllu"
"UD_27 test set of GALICIAN_CTG."
UD_27_GALICIAN_TREEGAL_TRAIN = _UD_27_HOME + "UD_Galician-TreeGal/gl_treegal-ud-train.conllu"
"UD_27 train set of GALICIAN_TREEGAL."
UD_27_GALICIAN_TREEGAL_TEST = _UD_27_HOME + "UD_Galician-TreeGal/gl_treegal-ud-test.conllu"
"UD_27 test set of GALICIAN_TREEGAL."
UD_27_GERMAN_GSD_TRAIN = _UD_27_HOME + "UD_German-GSD/de_gsd-ud-train.conllu"
"UD_27 train set of GERMAN_GSD."
UD_27_GERMAN_GSD_DEV = _UD_27_HOME + "UD_German-GSD/de_gsd-ud-dev.conllu"
"UD_27 dev set of GERMAN_GSD."
UD_27_GERMAN_GSD_TEST = _UD_27_HOME + "UD_German-GSD/de_gsd-ud-test.conllu"
"UD_27 test set of GERMAN_GSD."
UD_27_GERMAN_HDT_TRAIN = _UD_27_HOME + "UD_German-HDT/de_hdt-ud-train.conllu"
"UD_27 train set of GERMAN_HDT."
UD_27_GERMAN_HDT_DEV = _UD_27_HOME + "UD_German-HDT/de_hdt-ud-dev.conllu"
"UD_27 dev set of GERMAN_HDT."
UD_27_GERMAN_HDT_TEST = _UD_27_HOME + "UD_German-HDT/de_hdt-ud-test.conllu"
"UD_27 test set of GERMAN_HDT."
UD_27_GERMAN_LIT_TEST = _UD_27_HOME + "UD_German-LIT/de_lit-ud-test.conllu"
"UD_27 test set of GERMAN_LIT."
UD_27_GERMAN_PUD_TEST = _UD_27_HOME + "UD_German-PUD/de_pud-ud-test.conllu"
"UD_27 test set of GERMAN_PUD."
UD_27_GOTHIC_PROIEL_TRAIN = _UD_27_HOME + "UD_Gothic-PROIEL/got_proiel-ud-train.conllu"
"UD_27 train set of GOTHIC_PROIEL."
UD_27_GOTHIC_PROIEL_DEV = _UD_27_HOME + "UD_Gothic-PROIEL/got_proiel-ud-dev.conllu"
"UD_27 dev set of GOTHIC_PROIEL."
UD_27_GOTHIC_PROIEL_TEST = _UD_27_HOME + "UD_Gothic-PROIEL/got_proiel-ud-test.conllu"
"UD_27 test set of GOTHIC_PROIEL."
UD_27_GREEK_GDT_TRAIN = _UD_27_HOME + "UD_Greek-GDT/el_gdt-ud-train.conllu"
"UD_27 train set of GREEK_GDT."
UD_27_GREEK_GDT_DEV = _UD_27_HOME + "UD_Greek-GDT/el_gdt-ud-dev.conllu"
"UD_27 dev set of GREEK_GDT."
UD_27_GREEK_GDT_TEST = _UD_27_HOME + "UD_Greek-GDT/el_gdt-ud-test.conllu"
"UD_27 test set of GREEK_GDT."
UD_27_HEBREW_HTB_TRAIN = _UD_27_HOME + "UD_Hebrew-HTB/he_htb-ud-train.conllu"
"UD_27 train set of HEBREW_HTB."
UD_27_HEBREW_HTB_DEV = _UD_27_HOME + "UD_Hebrew-HTB/he_htb-ud-dev.conllu"
"UD_27 dev set of HEBREW_HTB."
UD_27_HEBREW_HTB_TEST = _UD_27_HOME + "UD_Hebrew-HTB/he_htb-ud-test.conllu"
"UD_27 test set of HEBREW_HTB."
UD_27_HINDI_HDTB_TRAIN = _UD_27_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-train.conllu"
"UD_27 train set of HINDI_HDTB."
UD_27_HINDI_HDTB_DEV = _UD_27_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu"
"UD_27 dev set of HINDI_HDTB."
UD_27_HINDI_HDTB_TEST = _UD_27_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-test.conllu"
"UD_27 test set of HINDI_HDTB."
UD_27_HINDI_PUD_TEST = _UD_27_HOME + "UD_Hindi-PUD/hi_pud-ud-test.conllu"
"UD_27 test set of HINDI_PUD."
UD_27_HINDI_ENGLISH_HIENCS_TRAIN = _UD_27_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-train.conllu"
"UD_27 train set of HINDI_ENGLISH_HIENCS."
UD_27_HINDI_ENGLISH_HIENCS_DEV = _UD_27_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-dev.conllu"
"UD_27 dev set of HINDI_ENGLISH_HIENCS."
UD_27_HINDI_ENGLISH_HIENCS_TEST = _UD_27_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-test.conllu"
"UD_27 test set of HINDI_ENGLISH_HIENCS."
UD_27_HUNGARIAN_SZEGED_TRAIN = _UD_27_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-train.conllu"
"UD_27 train set of HUNGARIAN_SZEGED."
UD_27_HUNGARIAN_SZEGED_DEV = _UD_27_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu"
"UD_27 dev set of HUNGARIAN_SZEGED."
UD_27_HUNGARIAN_SZEGED_TEST = _UD_27_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-test.conllu"
"UD_27 test set of HUNGARIAN_SZEGED."
UD_27_ICELANDIC_ICEPAHC_TRAIN = _UD_27_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-train.conllu"
"UD_27 train set of ICELANDIC_ICEPAHC."
UD_27_ICELANDIC_ICEPAHC_DEV = _UD_27_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-dev.conllu"
"UD_27 dev set of ICELANDIC_ICEPAHC."
UD_27_ICELANDIC_ICEPAHC_TEST = _UD_27_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-test.conllu"
"UD_27 test set of ICELANDIC_ICEPAHC."
UD_27_ICELANDIC_PUD_TEST = _UD_27_HOME + "UD_Icelandic-PUD/is_pud-ud-test.conllu"
"UD_27 test set of ICELANDIC_PUD."
UD_27_INDONESIAN_CSUI_TRAIN = _UD_27_HOME + "UD_Indonesian-CSUI/id_csui-ud-train.conllu"
"UD_27 train set of INDONESIAN_CSUI."
UD_27_INDONESIAN_CSUI_TEST = _UD_27_HOME + "UD_Indonesian-CSUI/id_csui-ud-test.conllu"
"UD_27 test set of INDONESIAN_CSUI."
UD_27_INDONESIAN_GSD_TRAIN = _UD_27_HOME + "UD_Indonesian-GSD/id_gsd-ud-train.conllu"
"UD_27 train set of INDONESIAN_GSD."
UD_27_INDONESIAN_GSD_DEV = _UD_27_HOME + "UD_Indonesian-GSD/id_gsd-ud-dev.conllu"
"UD_27 dev set of INDONESIAN_GSD."
UD_27_INDONESIAN_GSD_TEST = _UD_27_HOME + "UD_Indonesian-GSD/id_gsd-ud-test.conllu"
"UD_27 test set of INDONESIAN_GSD."
UD_27_INDONESIAN_PUD_TEST = _UD_27_HOME + "UD_Indonesian-PUD/id_pud-ud-test.conllu"
"UD_27 test set of INDONESIAN_PUD."
UD_27_IRISH_IDT_TRAIN = _UD_27_HOME + "UD_Irish-IDT/ga_idt-ud-train.conllu"
"UD_27 train set of IRISH_IDT."
UD_27_IRISH_IDT_DEV = _UD_27_HOME + "UD_Irish-IDT/ga_idt-ud-dev.conllu"
"UD_27 dev set of IRISH_IDT."
UD_27_IRISH_IDT_TEST = _UD_27_HOME + "UD_Irish-IDT/ga_idt-ud-test.conllu"
"UD_27 test set of IRISH_IDT."
UD_27_ITALIAN_ISDT_TRAIN = _UD_27_HOME + "UD_Italian-ISDT/it_isdt-ud-train.conllu"
"UD_27 train set of ITALIAN_ISDT."
UD_27_ITALIAN_ISDT_DEV = _UD_27_HOME + "UD_Italian-ISDT/it_isdt-ud-dev.conllu"
"UD_27 dev set of ITALIAN_ISDT."
UD_27_ITALIAN_ISDT_TEST = _UD_27_HOME + "UD_Italian-ISDT/it_isdt-ud-test.conllu"
"UD_27 test set of ITALIAN_ISDT."
UD_27_ITALIAN_PUD_TEST = _UD_27_HOME + "UD_Italian-PUD/it_pud-ud-test.conllu"
"UD_27 test set of ITALIAN_PUD."
UD_27_ITALIAN_PARTUT_TRAIN = _UD_27_HOME + "UD_Italian-ParTUT/it_partut-ud-train.conllu"
"UD_27 train set of ITALIAN_PARTUT."
UD_27_ITALIAN_PARTUT_DEV = _UD_27_HOME + "UD_Italian-ParTUT/it_partut-ud-dev.conllu"
"UD_27 dev set of ITALIAN_PARTUT."
UD_27_ITALIAN_PARTUT_TEST = _UD_27_HOME + "UD_Italian-ParTUT/it_partut-ud-test.conllu"
"UD_27 test set of ITALIAN_PARTUT."
UD_27_ITALIAN_POSTWITA_TRAIN = _UD_27_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-train.conllu"
"UD_27 train set of ITALIAN_POSTWITA."
UD_27_ITALIAN_POSTWITA_DEV = _UD_27_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu"
"UD_27 dev set of ITALIAN_POSTWITA."
UD_27_ITALIAN_POSTWITA_TEST = _UD_27_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-test.conllu"
"UD_27 test set of ITALIAN_POSTWITA."
UD_27_ITALIAN_TWITTIRO_TRAIN = _UD_27_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu"
"UD_27 train set of ITALIAN_TWITTIRO."
UD_27_ITALIAN_TWITTIRO_DEV = _UD_27_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu"
"UD_27 dev set of ITALIAN_TWITTIRO."
UD_27_ITALIAN_TWITTIRO_TEST = _UD_27_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-test.conllu"
"UD_27 test set of ITALIAN_TWITTIRO."
UD_27_ITALIAN_VIT_TRAIN = _UD_27_HOME + "UD_Italian-VIT/it_vit-ud-train.conllu"
"UD_27 train set of ITALIAN_VIT."
UD_27_ITALIAN_VIT_DEV = _UD_27_HOME + "UD_Italian-VIT/it_vit-ud-dev.conllu"
"UD_27 dev set of ITALIAN_VIT."
UD_27_ITALIAN_VIT_TEST = _UD_27_HOME + "UD_Italian-VIT/it_vit-ud-test.conllu"
"UD_27 test set of ITALIAN_VIT."
UD_27_JAPANESE_BCCWJ_TRAIN = _UD_27_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu"
"UD_27 train set of JAPANESE_BCCWJ."
UD_27_JAPANESE_BCCWJ_DEV = _UD_27_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu"
"UD_27 dev set of JAPANESE_BCCWJ."
UD_27_JAPANESE_BCCWJ_TEST = _UD_27_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu"
"UD_27 test set of JAPANESE_BCCWJ."
UD_27_JAPANESE_GSD_TRAIN = _UD_27_HOME + "UD_Japanese-GSD/ja_gsd-ud-train.conllu"
"UD_27 train set of JAPANESE_GSD."
UD_27_JAPANESE_GSD_DEV = _UD_27_HOME + "UD_Japanese-GSD/ja_gsd-ud-dev.conllu"
"UD_27 dev set of JAPANESE_GSD."
UD_27_JAPANESE_GSD_TEST = _UD_27_HOME + "UD_Japanese-GSD/ja_gsd-ud-test.conllu"
"UD_27 test set of JAPANESE_GSD."
UD_27_JAPANESE_MODERN_TEST = _UD_27_HOME + "UD_Japanese-Modern/ja_modern-ud-test.conllu"
"UD_27 test set of JAPANESE_MODERN."
UD_27_JAPANESE_PUD_TEST = _UD_27_HOME + "UD_Japanese-PUD/ja_pud-ud-test.conllu"
"UD_27 test set of JAPANESE_PUD."
UD_27_KARELIAN_KKPP_TEST = _UD_27_HOME + "UD_Karelian-KKPP/krl_kkpp-ud-test.conllu"
"UD_27 test set of KARELIAN_KKPP."
UD_27_KAZAKH_KTB_TRAIN = _UD_27_HOME + "UD_Kazakh-KTB/kk_ktb-ud-train.conllu"
"UD_27 train set of KAZAKH_KTB."
UD_27_KAZAKH_KTB_TEST = _UD_27_HOME + "UD_Kazakh-KTB/kk_ktb-ud-test.conllu"
"UD_27 test set of KAZAKH_KTB."
UD_27_KHUNSARI_AHA_TEST = _UD_27_HOME + "UD_Khunsari-AHA/kfm_aha-ud-test.conllu"
"UD_27 test set of KHUNSARI_AHA."
UD_27_KOMI_PERMYAK_UH_TEST = _UD_27_HOME + "UD_Komi_Permyak-UH/koi_uh-ud-test.conllu"
"UD_27 test set of KOMI_PERMYAK_UH."
UD_27_KOMI_ZYRIAN_IKDP_TEST = _UD_27_HOME + "UD_Komi_Zyrian-IKDP/kpv_ikdp-ud-test.conllu"
"UD_27 test set of KOMI_ZYRIAN_IKDP."
UD_27_KOMI_ZYRIAN_LATTICE_TEST = _UD_27_HOME + "UD_Komi_Zyrian-Lattice/kpv_lattice-ud-test.conllu"
"UD_27 test set of KOMI_ZYRIAN_LATTICE."
UD_27_KOREAN_GSD_TRAIN = _UD_27_HOME + "UD_Korean-GSD/ko_gsd-ud-train.conllu"
"UD_27 train set of KOREAN_GSD."
UD_27_KOREAN_GSD_DEV = _UD_27_HOME + "UD_Korean-GSD/ko_gsd-ud-dev.conllu"
"UD_27 dev set of KOREAN_GSD."
UD_27_KOREAN_GSD_TEST = _UD_27_HOME + "UD_Korean-GSD/ko_gsd-ud-test.conllu"
"UD_27 test set of KOREAN_GSD."
UD_27_KOREAN_KAIST_TRAIN = _UD_27_HOME + "UD_Korean-Kaist/ko_kaist-ud-train.conllu"
"UD_27 train set of KOREAN_KAIST."
UD_27_KOREAN_KAIST_DEV = _UD_27_HOME + "UD_Korean-Kaist/ko_kaist-ud-dev.conllu"
"UD_27 dev set of KOREAN_KAIST."
UD_27_KOREAN_KAIST_TEST = _UD_27_HOME + "UD_Korean-Kaist/ko_kaist-ud-test.conllu"
"UD_27 test set of KOREAN_KAIST."
UD_27_KOREAN_PUD_TEST = _UD_27_HOME + "UD_Korean-PUD/ko_pud-ud-test.conllu"
"UD_27 test set of KOREAN_PUD."
UD_27_KURMANJI_MG_TRAIN = _UD_27_HOME + "UD_Kurmanji-MG/kmr_mg-ud-train.conllu"
"UD_27 train set of KURMANJI_MG."
UD_27_KURMANJI_MG_TEST = _UD_27_HOME + "UD_Kurmanji-MG/kmr_mg-ud-test.conllu"
"UD_27 test set of KURMANJI_MG."
UD_27_LATIN_ITTB_TRAIN = _UD_27_HOME + "UD_Latin-ITTB/la_ittb-ud-train.conllu"
"UD_27 train set of LATIN_ITTB."
UD_27_LATIN_ITTB_DEV = _UD_27_HOME + "UD_Latin-ITTB/la_ittb-ud-dev.conllu"
"UD_27 dev set of LATIN_ITTB."
UD_27_LATIN_ITTB_TEST = _UD_27_HOME + "UD_Latin-ITTB/la_ittb-ud-test.conllu"
"UD_27 test set of LATIN_ITTB."
UD_27_LATIN_LLCT_TRAIN = _UD_27_HOME + "UD_Latin-LLCT/la_llct-ud-train.conllu"
"UD_27 train set of LATIN_LLCT."
UD_27_LATIN_LLCT_DEV = _UD_27_HOME + "UD_Latin-LLCT/la_llct-ud-dev.conllu"
"UD_27 dev set of LATIN_LLCT."
UD_27_LATIN_LLCT_TEST = _UD_27_HOME + "UD_Latin-LLCT/la_llct-ud-test.conllu"
"UD_27 test set of LATIN_LLCT."
UD_27_LATIN_PROIEL_TRAIN = _UD_27_HOME + "UD_Latin-PROIEL/la_proiel-ud-train.conllu"
"UD_27 train set of LATIN_PROIEL."
UD_27_LATIN_PROIEL_DEV = _UD_27_HOME + "UD_Latin-PROIEL/la_proiel-ud-dev.conllu"
"UD_27 dev set of LATIN_PROIEL."
UD_27_LATIN_PROIEL_TEST = _UD_27_HOME + "UD_Latin-PROIEL/la_proiel-ud-test.conllu"
"UD_27 test set of LATIN_PROIEL."
UD_27_LATIN_PERSEUS_TRAIN = _UD_27_HOME + "UD_Latin-Perseus/la_perseus-ud-train.conllu"
"UD_27 train set of LATIN_PERSEUS."
UD_27_LATIN_PERSEUS_TEST = _UD_27_HOME + "UD_Latin-Perseus/la_perseus-ud-test.conllu"
"UD_27 test set of LATIN_PERSEUS."
UD_27_LATVIAN_LVTB_TRAIN = _UD_27_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-train.conllu"
"UD_27 train set of LATVIAN_LVTB."
UD_27_LATVIAN_LVTB_DEV = _UD_27_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu"
"UD_27 dev set of LATVIAN_LVTB."
UD_27_LATVIAN_LVTB_TEST = _UD_27_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-test.conllu"
"UD_27 test set of LATVIAN_LVTB."
UD_27_LITHUANIAN_ALKSNIS_TRAIN = _UD_27_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu"
"UD_27 train set of LITHUANIAN_ALKSNIS."
UD_27_LITHUANIAN_ALKSNIS_DEV = _UD_27_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu"
"UD_27 dev set of LITHUANIAN_ALKSNIS."
UD_27_LITHUANIAN_ALKSNIS_TEST = _UD_27_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-test.conllu"
"UD_27 test set of LITHUANIAN_ALKSNIS."
UD_27_LITHUANIAN_HSE_TRAIN = _UD_27_HOME + "UD_Lithuanian-HSE/lt_hse-ud-train.conllu"
"UD_27 train set of LITHUANIAN_HSE."
UD_27_LITHUANIAN_HSE_DEV = _UD_27_HOME + "UD_Lithuanian-HSE/lt_hse-ud-dev.conllu"
"UD_27 dev set of LITHUANIAN_HSE."
UD_27_LITHUANIAN_HSE_TEST = _UD_27_HOME + "UD_Lithuanian-HSE/lt_hse-ud-test.conllu"
"UD_27 test set of LITHUANIAN_HSE."
UD_27_LIVVI_KKPP_TRAIN = _UD_27_HOME + "UD_Livvi-KKPP/olo_kkpp-ud-train.conllu"
"UD_27 train set of LIVVI_KKPP."
UD_27_LIVVI_KKPP_TEST = _UD_27_HOME + "UD_Livvi-KKPP/olo_kkpp-ud-test.conllu"
"UD_27 test set of LIVVI_KKPP."
UD_27_MALTESE_MUDT_TRAIN = _UD_27_HOME + "UD_Maltese-MUDT/mt_mudt-ud-train.conllu"
"UD_27 train set of MALTESE_MUDT."
UD_27_MALTESE_MUDT_DEV = _UD_27_HOME + "UD_Maltese-MUDT/mt_mudt-ud-dev.conllu"
"UD_27 dev set of MALTESE_MUDT."
UD_27_MALTESE_MUDT_TEST = _UD_27_HOME + "UD_Maltese-MUDT/mt_mudt-ud-test.conllu"
"UD_27 test set of MALTESE_MUDT."
UD_27_MANX_CADHAN_TEST = _UD_27_HOME + "UD_Manx-Cadhan/gv_cadhan-ud-test.conllu"
"UD_27 test set of MANX_CADHAN."
UD_27_MARATHI_UFAL_TRAIN = _UD_27_HOME + "UD_Marathi-UFAL/mr_ufal-ud-train.conllu"
"UD_27 train set of MARATHI_UFAL."
UD_27_MARATHI_UFAL_DEV = _UD_27_HOME + "UD_Marathi-UFAL/mr_ufal-ud-dev.conllu"
"UD_27 dev set of MARATHI_UFAL."
UD_27_MARATHI_UFAL_TEST = _UD_27_HOME + "UD_Marathi-UFAL/mr_ufal-ud-test.conllu"
"UD_27 test set of MARATHI_UFAL."
UD_27_MBYA_GUARANI_DOOLEY_TEST = _UD_27_HOME + "UD_Mbya_Guarani-Dooley/gun_dooley-ud-test.conllu"
"UD_27 test set of MBYA_GUARANI_DOOLEY."
UD_27_MBYA_GUARANI_THOMAS_TEST = _UD_27_HOME + "UD_Mbya_Guarani-Thomas/gun_thomas-ud-test.conllu"
"UD_27 test set of MBYA_GUARANI_THOMAS."
UD_27_MOKSHA_JR_TEST = _UD_27_HOME + "UD_Moksha-JR/mdf_jr-ud-test.conllu"
"UD_27 test set of MOKSHA_JR."
UD_27_MUNDURUKU_TUDET_TEST = _UD_27_HOME + "UD_Munduruku-TuDeT/myu_tudet-ud-test.conllu"
"UD_27 test set of MUNDURUKU_TUDET."
UD_27_NAIJA_NSC_TRAIN = _UD_27_HOME + "UD_Naija-NSC/pcm_nsc-ud-train.conllu"
"UD_27 train set of NAIJA_NSC."
UD_27_NAIJA_NSC_DEV = _UD_27_HOME + "UD_Naija-NSC/pcm_nsc-ud-dev.conllu"
"UD_27 dev set of NAIJA_NSC."
UD_27_NAIJA_NSC_TEST = _UD_27_HOME + "UD_Naija-NSC/pcm_nsc-ud-test.conllu"
"UD_27 test set of NAIJA_NSC."
UD_27_NAYINI_AHA_TEST = _UD_27_HOME + "UD_Nayini-AHA/nyq_aha-ud-test.conllu"
"UD_27 test set of NAYINI_AHA."
UD_27_NORTH_SAMI_GIELLA_TRAIN = _UD_27_HOME + "UD_North_Sami-Giella/sme_giella-ud-train.conllu"
"UD_27 train set of NORTH_SAMI_GIELLA."
UD_27_NORTH_SAMI_GIELLA_TEST = _UD_27_HOME + "UD_North_Sami-Giella/sme_giella-ud-test.conllu"
"UD_27 test set of NORTH_SAMI_GIELLA."
UD_27_NORWEGIAN_BOKMAAL_TRAIN = _UD_27_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu"
"UD_27 train set of NORWEGIAN_BOKMAAL."
UD_27_NORWEGIAN_BOKMAAL_DEV = _UD_27_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu"
"UD_27 dev set of NORWEGIAN_BOKMAAL."
UD_27_NORWEGIAN_BOKMAAL_TEST = _UD_27_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu"
"UD_27 test set of NORWEGIAN_BOKMAAL."
UD_27_NORWEGIAN_NYNORSK_TRAIN = _UD_27_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu"
"UD_27 train set of NORWEGIAN_NYNORSK."
UD_27_NORWEGIAN_NYNORSK_DEV = _UD_27_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu"
"UD_27 dev set of NORWEGIAN_NYNORSK."
UD_27_NORWEGIAN_NYNORSK_TEST = _UD_27_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu"
"UD_27 test set of NORWEGIAN_NYNORSK."
UD_27_NORWEGIAN_NYNORSKLIA_TRAIN = _UD_27_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu"
"UD_27 train set of NORWEGIAN_NYNORSKLIA."
UD_27_NORWEGIAN_NYNORSKLIA_DEV = _UD_27_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu"
"UD_27 dev set of NORWEGIAN_NYNORSKLIA."
UD_27_NORWEGIAN_NYNORSKLIA_TEST = _UD_27_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu"
"UD_27 test set of NORWEGIAN_NYNORSKLIA."
UD_27_OLD_CHURCH_SLAVONIC_PROIEL_TRAIN = _UD_27_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu"
"UD_27 train set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_27_OLD_CHURCH_SLAVONIC_PROIEL_DEV = _UD_27_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu"
"UD_27 dev set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_27_OLD_CHURCH_SLAVONIC_PROIEL_TEST = _UD_27_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu"
"UD_27 test set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_27_OLD_FRENCH_SRCMF_TRAIN = _UD_27_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu"
"UD_27 train set of OLD_FRENCH_SRCMF."
UD_27_OLD_FRENCH_SRCMF_DEV = _UD_27_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu"
"UD_27 dev set of OLD_FRENCH_SRCMF."
UD_27_OLD_FRENCH_SRCMF_TEST = _UD_27_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu"
"UD_27 test set of OLD_FRENCH_SRCMF."
UD_27_OLD_RUSSIAN_RNC_TRAIN = _UD_27_HOME + "UD_Old_Russian-RNC/orv_rnc-ud-train.conllu"
"UD_27 train set of OLD_RUSSIAN_RNC."
UD_27_OLD_RUSSIAN_RNC_TEST = _UD_27_HOME + "UD_Old_Russian-RNC/orv_rnc-ud-test.conllu"
"UD_27 test set of OLD_RUSSIAN_RNC."
UD_27_OLD_RUSSIAN_TOROT_TRAIN = _UD_27_HOME + "UD_Old_Russian-TOROT/orv_torot-ud-train.conllu"
"UD_27 train set of OLD_RUSSIAN_TOROT."
UD_27_OLD_RUSSIAN_TOROT_DEV = _UD_27_HOME + "UD_Old_Russian-TOROT/orv_torot-ud-dev.conllu"
"UD_27 dev set of OLD_RUSSIAN_TOROT."
UD_27_OLD_RUSSIAN_TOROT_TEST = _UD_27_HOME + "UD_Old_Russian-TOROT/orv_torot-ud-test.conllu"
"UD_27 test set of OLD_RUSSIAN_TOROT."
UD_27_OLD_TURKISH_TONQQ_TEST = _UD_27_HOME + "UD_Old_Turkish-Tonqq/otk_tonqq-ud-test.conllu"
"UD_27 test set of OLD_TURKISH_TONQQ."
UD_27_PERSIAN_PERDT_TRAIN = _UD_27_HOME + "UD_Persian-PerDT/fa_perdt-ud-train.conllu"
"UD_27 train set of PERSIAN_PERDT."
UD_27_PERSIAN_PERDT_DEV = _UD_27_HOME + "UD_Persian-PerDT/fa_perdt-ud-dev.conllu"
"UD_27 dev set of PERSIAN_PERDT."
UD_27_PERSIAN_PERDT_TEST = _UD_27_HOME + "UD_Persian-PerDT/fa_perdt-ud-test.conllu"
"UD_27 test set of PERSIAN_PERDT."
UD_27_PERSIAN_SERAJI_TRAIN = _UD_27_HOME + "UD_Persian-Seraji/fa_seraji-ud-train.conllu"
"UD_27 train set of PERSIAN_SERAJI."
UD_27_PERSIAN_SERAJI_DEV = _UD_27_HOME + "UD_Persian-Seraji/fa_seraji-ud-dev.conllu"
"UD_27 dev set of PERSIAN_SERAJI."
UD_27_PERSIAN_SERAJI_TEST = _UD_27_HOME + "UD_Persian-Seraji/fa_seraji-ud-test.conllu"
"UD_27 test set of PERSIAN_SERAJI."
UD_27_POLISH_LFG_TRAIN = _UD_27_HOME + "UD_Polish-LFG/pl_lfg-ud-train.conllu"
"UD_27 train set of POLISH_LFG."
UD_27_POLISH_LFG_DEV = _UD_27_HOME + "UD_Polish-LFG/pl_lfg-ud-dev.conllu"
"UD_27 dev set of POLISH_LFG."
UD_27_POLISH_LFG_TEST = _UD_27_HOME + "UD_Polish-LFG/pl_lfg-ud-test.conllu"
"UD_27 test set of POLISH_LFG."
UD_27_POLISH_PDB_TRAIN = _UD_27_HOME + "UD_Polish-PDB/pl_pdb-ud-train.conllu"
"UD_27 train set of POLISH_PDB."
UD_27_POLISH_PDB_DEV = _UD_27_HOME + "UD_Polish-PDB/pl_pdb-ud-dev.conllu"
"UD_27 dev set of POLISH_PDB."
UD_27_POLISH_PDB_TEST = _UD_27_HOME + "UD_Polish-PDB/pl_pdb-ud-test.conllu"
"UD_27 test set of POLISH_PDB."
UD_27_POLISH_PUD_TEST = _UD_27_HOME + "UD_Polish-PUD/pl_pud-ud-test.conllu"
"UD_27 test set of POLISH_PUD."
UD_27_PORTUGUESE_BOSQUE_TRAIN = _UD_27_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-train.conllu"
"UD_27 train set of PORTUGUESE_BOSQUE."
UD_27_PORTUGUESE_BOSQUE_DEV = _UD_27_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu"
"UD_27 dev set of PORTUGUESE_BOSQUE."
UD_27_PORTUGUESE_BOSQUE_TEST = _UD_27_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-test.conllu"
"UD_27 test set of PORTUGUESE_BOSQUE."
UD_27_PORTUGUESE_GSD_TRAIN = _UD_27_HOME + "UD_Portuguese-GSD/pt_gsd-ud-train.conllu"
"UD_27 train set of PORTUGUESE_GSD."
UD_27_PORTUGUESE_GSD_DEV = _UD_27_HOME + "UD_Portuguese-GSD/pt_gsd-ud-dev.conllu"
"UD_27 dev set of PORTUGUESE_GSD."
UD_27_PORTUGUESE_GSD_TEST = _UD_27_HOME + "UD_Portuguese-GSD/pt_gsd-ud-test.conllu"
"UD_27 test set of PORTUGUESE_GSD."
UD_27_PORTUGUESE_PUD_TEST = _UD_27_HOME + "UD_Portuguese-PUD/pt_pud-ud-test.conllu"
"UD_27 test set of PORTUGUESE_PUD."
UD_27_ROMANIAN_NONSTANDARD_TRAIN = _UD_27_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu"
"UD_27 train set of ROMANIAN_NONSTANDARD."
UD_27_ROMANIAN_NONSTANDARD_DEV = _UD_27_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu"
"UD_27 dev set of ROMANIAN_NONSTANDARD."
UD_27_ROMANIAN_NONSTANDARD_TEST = _UD_27_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu"
"UD_27 test set of ROMANIAN_NONSTANDARD."
UD_27_ROMANIAN_RRT_TRAIN = _UD_27_HOME + "UD_Romanian-RRT/ro_rrt-ud-train.conllu"
"UD_27 train set of ROMANIAN_RRT."
UD_27_ROMANIAN_RRT_DEV = _UD_27_HOME + "UD_Romanian-RRT/ro_rrt-ud-dev.conllu"
"UD_27 dev set of ROMANIAN_RRT."
UD_27_ROMANIAN_RRT_TEST = _UD_27_HOME + "UD_Romanian-RRT/ro_rrt-ud-test.conllu"
"UD_27 test set of ROMANIAN_RRT."
UD_27_ROMANIAN_SIMONERO_TRAIN = _UD_27_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-train.conllu"
"UD_27 train set of ROMANIAN_SIMONERO."
UD_27_ROMANIAN_SIMONERO_DEV = _UD_27_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-dev.conllu"
"UD_27 dev set of ROMANIAN_SIMONERO."
UD_27_ROMANIAN_SIMONERO_TEST = _UD_27_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-test.conllu"
"UD_27 test set of ROMANIAN_SIMONERO."
UD_27_RUSSIAN_GSD_TRAIN = _UD_27_HOME + "UD_Russian-GSD/ru_gsd-ud-train.conllu"
"UD_27 train set of RUSSIAN_GSD."
UD_27_RUSSIAN_GSD_DEV = _UD_27_HOME + "UD_Russian-GSD/ru_gsd-ud-dev.conllu"
"UD_27 dev set of RUSSIAN_GSD."
UD_27_RUSSIAN_GSD_TEST = _UD_27_HOME + "UD_Russian-GSD/ru_gsd-ud-test.conllu"
"UD_27 test set of RUSSIAN_GSD."
UD_27_RUSSIAN_PUD_TEST = _UD_27_HOME + "UD_Russian-PUD/ru_pud-ud-test.conllu"
"UD_27 test set of RUSSIAN_PUD."
UD_27_RUSSIAN_SYNTAGRUS_TRAIN = _UD_27_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu"
"UD_27 train set of RUSSIAN_SYNTAGRUS."
UD_27_RUSSIAN_SYNTAGRUS_DEV = _UD_27_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu"
"UD_27 dev set of RUSSIAN_SYNTAGRUS."
UD_27_RUSSIAN_SYNTAGRUS_TEST = _UD_27_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu"
"UD_27 test set of RUSSIAN_SYNTAGRUS."
UD_27_RUSSIAN_TAIGA_TRAIN = _UD_27_HOME + "UD_Russian-Taiga/ru_taiga-ud-train.conllu"
"UD_27 train set of RUSSIAN_TAIGA."
UD_27_RUSSIAN_TAIGA_DEV = _UD_27_HOME + "UD_Russian-Taiga/ru_taiga-ud-dev.conllu"
"UD_27 dev set of RUSSIAN_TAIGA."
UD_27_RUSSIAN_TAIGA_TEST = _UD_27_HOME + "UD_Russian-Taiga/ru_taiga-ud-test.conllu"
"UD_27 test set of RUSSIAN_TAIGA."
UD_27_SANSKRIT_UFAL_TEST = _UD_27_HOME + "UD_Sanskrit-UFAL/sa_ufal-ud-test.conllu"
"UD_27 test set of SANSKRIT_UFAL."
UD_27_SANSKRIT_VEDIC_TRAIN = _UD_27_HOME + "UD_Sanskrit-Vedic/sa_vedic-ud-train.conllu"
"UD_27 train set of SANSKRIT_VEDIC."
UD_27_SANSKRIT_VEDIC_TEST = _UD_27_HOME + "UD_Sanskrit-Vedic/sa_vedic-ud-test.conllu"
"UD_27 test set of SANSKRIT_VEDIC."
UD_27_SCOTTISH_GAELIC_ARCOSG_TRAIN = _UD_27_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu"
"UD_27 train set of SCOTTISH_GAELIC_ARCOSG."
UD_27_SCOTTISH_GAELIC_ARCOSG_DEV = _UD_27_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu"
"UD_27 dev set of SCOTTISH_GAELIC_ARCOSG."
UD_27_SCOTTISH_GAELIC_ARCOSG_TEST = _UD_27_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-test.conllu"
"UD_27 test set of SCOTTISH_GAELIC_ARCOSG."
UD_27_SERBIAN_SET_TRAIN = _UD_27_HOME + "UD_Serbian-SET/sr_set-ud-train.conllu"
"UD_27 train set of SERBIAN_SET."
UD_27_SERBIAN_SET_DEV = _UD_27_HOME + "UD_Serbian-SET/sr_set-ud-dev.conllu"
"UD_27 dev set of SERBIAN_SET."
UD_27_SERBIAN_SET_TEST = _UD_27_HOME + "UD_Serbian-SET/sr_set-ud-test.conllu"
"UD_27 test set of SERBIAN_SET."
UD_27_SKOLT_SAMI_GIELLAGAS_TEST = _UD_27_HOME + "UD_Skolt_Sami-Giellagas/sms_giellagas-ud-test.conllu"
"UD_27 test set of SKOLT_SAMI_GIELLAGAS."
UD_27_SLOVAK_SNK_TRAIN = _UD_27_HOME + "UD_Slovak-SNK/sk_snk-ud-train.conllu"
"UD_27 train set of SLOVAK_SNK."
UD_27_SLOVAK_SNK_DEV = _UD_27_HOME + "UD_Slovak-SNK/sk_snk-ud-dev.conllu"
"UD_27 dev set of SLOVAK_SNK."
UD_27_SLOVAK_SNK_TEST = _UD_27_HOME + "UD_Slovak-SNK/sk_snk-ud-test.conllu"
"UD_27 test set of SLOVAK_SNK."
UD_27_SLOVENIAN_SSJ_TRAIN = _UD_27_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-train.conllu"
"UD_27 train set of SLOVENIAN_SSJ."
UD_27_SLOVENIAN_SSJ_DEV = _UD_27_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu"
"UD_27 dev set of SLOVENIAN_SSJ."
UD_27_SLOVENIAN_SSJ_TEST = _UD_27_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-test.conllu"
"UD_27 test set of SLOVENIAN_SSJ."
UD_27_SLOVENIAN_SST_TRAIN = _UD_27_HOME + "UD_Slovenian-SST/sl_sst-ud-train.conllu"
"UD_27 train set of SLOVENIAN_SST."
UD_27_SLOVENIAN_SST_TEST = _UD_27_HOME + "UD_Slovenian-SST/sl_sst-ud-test.conllu"
"UD_27 test set of SLOVENIAN_SST."
UD_27_SOI_AHA_TEST = _UD_27_HOME + "UD_Soi-AHA/soj_aha-ud-test.conllu"
"UD_27 test set of SOI_AHA."
UD_27_SOUTH_LEVANTINE_ARABIC_MADAR_TEST = _UD_27_HOME + "UD_South_Levantine_Arabic-MADAR/ajp_madar-ud-test.conllu"
"UD_27 test set of SOUTH_LEVANTINE_ARABIC_MADAR."
UD_27_SPANISH_ANCORA_TRAIN = _UD_27_HOME + "UD_Spanish-AnCora/es_ancora-ud-train.conllu"
"UD_27 train set of SPANISH_ANCORA."
UD_27_SPANISH_ANCORA_DEV = _UD_27_HOME + "UD_Spanish-AnCora/es_ancora-ud-dev.conllu"
"UD_27 dev set of SPANISH_ANCORA."
UD_27_SPANISH_ANCORA_TEST = _UD_27_HOME + "UD_Spanish-AnCora/es_ancora-ud-test.conllu"
"UD_27 test set of SPANISH_ANCORA."
UD_27_SPANISH_GSD_TRAIN = _UD_27_HOME + "UD_Spanish-GSD/es_gsd-ud-train.conllu"
"UD_27 train set of SPANISH_GSD."
UD_27_SPANISH_GSD_DEV = _UD_27_HOME + "UD_Spanish-GSD/es_gsd-ud-dev.conllu"
"UD_27 dev set of SPANISH_GSD."
UD_27_SPANISH_GSD_TEST = _UD_27_HOME + "UD_Spanish-GSD/es_gsd-ud-test.conllu"
"UD_27 test set of SPANISH_GSD."
UD_27_SPANISH_PUD_TEST = _UD_27_HOME + "UD_Spanish-PUD/es_pud-ud-test.conllu"
"UD_27 test set of SPANISH_PUD."
UD_27_SWEDISH_LINES_TRAIN = _UD_27_HOME + "UD_Swedish-LinES/sv_lines-ud-train.conllu"
"UD_27 train set of SWEDISH_LINES."
UD_27_SWEDISH_LINES_DEV = _UD_27_HOME + "UD_Swedish-LinES/sv_lines-ud-dev.conllu"
"UD_27 dev set of SWEDISH_LINES."
UD_27_SWEDISH_LINES_TEST = _UD_27_HOME + "UD_Swedish-LinES/sv_lines-ud-test.conllu"
"UD_27 test set of SWEDISH_LINES."
UD_27_SWEDISH_PUD_TEST = _UD_27_HOME + "UD_Swedish-PUD/sv_pud-ud-test.conllu"
"UD_27 test set of SWEDISH_PUD."
UD_27_SWEDISH_TALBANKEN_TRAIN = _UD_27_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu"
"UD_27 train set of SWEDISH_TALBANKEN."
UD_27_SWEDISH_TALBANKEN_DEV = _UD_27_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu"
"UD_27 dev set of SWEDISH_TALBANKEN."
UD_27_SWEDISH_TALBANKEN_TEST = _UD_27_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu"
"UD_27 test set of SWEDISH_TALBANKEN."
UD_27_SWEDISH_SIGN_LANGUAGE_SSLC_TRAIN = _UD_27_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu"
"UD_27 train set of SWEDISH_SIGN_LANGUAGE_SSLC."
UD_27_SWEDISH_SIGN_LANGUAGE_SSLC_DEV = _UD_27_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu"
"UD_27 dev set of SWEDISH_SIGN_LANGUAGE_SSLC."
UD_27_SWEDISH_SIGN_LANGUAGE_SSLC_TEST = _UD_27_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu"
"UD_27 test set of SWEDISH_SIGN_LANGUAGE_SSLC."
UD_27_SWISS_GERMAN_UZH_TEST = _UD_27_HOME + "UD_Swiss_German-UZH/gsw_uzh-ud-test.conllu"
"UD_27 test set of SWISS_GERMAN_UZH."
UD_27_TAGALOG_TRG_TEST = _UD_27_HOME + "UD_Tagalog-TRG/tl_trg-ud-test.conllu"
"UD_27 test set of TAGALOG_TRG."
UD_27_TAGALOG_UGNAYAN_TEST = _UD_27_HOME + "UD_Tagalog-Ugnayan/tl_ugnayan-ud-test.conllu"
"UD_27 test set of TAGALOG_UGNAYAN."
UD_27_TAMIL_MWTT_TEST = _UD_27_HOME + "UD_Tamil-MWTT/ta_mwtt-ud-test.conllu"
"UD_27 test set of TAMIL_MWTT."
UD_27_TAMIL_TTB_TRAIN = _UD_27_HOME + "UD_Tamil-TTB/ta_ttb-ud-train.conllu"
"UD_27 train set of TAMIL_TTB."
UD_27_TAMIL_TTB_DEV = _UD_27_HOME + "UD_Tamil-TTB/ta_ttb-ud-dev.conllu"
"UD_27 dev set of TAMIL_TTB."
UD_27_TAMIL_TTB_TEST = _UD_27_HOME + "UD_Tamil-TTB/ta_ttb-ud-test.conllu"
"UD_27 test set of TAMIL_TTB."
UD_27_TELUGU_MTG_TRAIN = _UD_27_HOME + "UD_Telugu-MTG/te_mtg-ud-train.conllu"
"UD_27 train set of TELUGU_MTG."
UD_27_TELUGU_MTG_DEV = _UD_27_HOME + "UD_Telugu-MTG/te_mtg-ud-dev.conllu"
"UD_27 dev set of TELUGU_MTG."
UD_27_TELUGU_MTG_TEST = _UD_27_HOME + "UD_Telugu-MTG/te_mtg-ud-test.conllu"
"UD_27 test set of TELUGU_MTG."
UD_27_THAI_PUD_TEST = _UD_27_HOME + "UD_Thai-PUD/th_pud-ud-test.conllu"
"UD_27 test set of THAI_PUD."
UD_27_TUPINAMBA_TUDET_TEST = _UD_27_HOME + "UD_Tupinamba-TuDeT/tpn_tudet-ud-test.conllu"
"UD_27 test set of TUPINAMBA_TUDET."
UD_27_TURKISH_BOUN_TRAIN = _UD_27_HOME + "UD_Turkish-BOUN/tr_boun-ud-train.conllu"
"UD_27 train set of TURKISH_BOUN."
UD_27_TURKISH_BOUN_DEV = _UD_27_HOME + "UD_Turkish-BOUN/tr_boun-ud-dev.conllu"
"UD_27 dev set of TURKISH_BOUN."
UD_27_TURKISH_BOUN_TEST = _UD_27_HOME + "UD_Turkish-BOUN/tr_boun-ud-test.conllu"
"UD_27 test set of TURKISH_BOUN."
UD_27_TURKISH_GB_TEST = _UD_27_HOME + "UD_Turkish-GB/tr_gb-ud-test.conllu"
"UD_27 test set of TURKISH_GB."
UD_27_TURKISH_IMST_TRAIN = _UD_27_HOME + "UD_Turkish-IMST/tr_imst-ud-train.conllu"
"UD_27 train set of TURKISH_IMST."
UD_27_TURKISH_IMST_DEV = _UD_27_HOME + "UD_Turkish-IMST/tr_imst-ud-dev.conllu"
"UD_27 dev set of TURKISH_IMST."
UD_27_TURKISH_IMST_TEST = _UD_27_HOME + "UD_Turkish-IMST/tr_imst-ud-test.conllu"
"UD_27 test set of TURKISH_IMST."
UD_27_TURKISH_PUD_TEST = _UD_27_HOME + "UD_Turkish-PUD/tr_pud-ud-test.conllu"
"UD_27 test set of TURKISH_PUD."
UD_27_TURKISH_GERMAN_SAGT_TRAIN = _UD_27_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-train.conllu"
"UD_27 train set of TURKISH_GERMAN_SAGT."
UD_27_TURKISH_GERMAN_SAGT_DEV = _UD_27_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-dev.conllu"
"UD_27 dev set of TURKISH_GERMAN_SAGT."
UD_27_TURKISH_GERMAN_SAGT_TEST = _UD_27_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-test.conllu"
"UD_27 test set of TURKISH_GERMAN_SAGT."
UD_27_UKRAINIAN_IU_TRAIN = _UD_27_HOME + "UD_Ukrainian-IU/uk_iu-ud-train.conllu"
"UD_27 train set of UKRAINIAN_IU."
UD_27_UKRAINIAN_IU_DEV = _UD_27_HOME + "UD_Ukrainian-IU/uk_iu-ud-dev.conllu"
"UD_27 dev set of UKRAINIAN_IU."
UD_27_UKRAINIAN_IU_TEST = _UD_27_HOME + "UD_Ukrainian-IU/uk_iu-ud-test.conllu"
"UD_27 test set of UKRAINIAN_IU."
UD_27_UPPER_SORBIAN_UFAL_TRAIN = _UD_27_HOME + "UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu"
"UD_27 train set of UPPER_SORBIAN_UFAL."
UD_27_UPPER_SORBIAN_UFAL_TEST = _UD_27_HOME + "UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu"
"UD_27 test set of UPPER_SORBIAN_UFAL."
UD_27_URDU_UDTB_TRAIN = _UD_27_HOME + "UD_Urdu-UDTB/ur_udtb-ud-train.conllu"
"UD_27 train set of URDU_UDTB."
UD_27_URDU_UDTB_DEV = _UD_27_HOME + "UD_Urdu-UDTB/ur_udtb-ud-dev.conllu"
"UD_27 dev set of URDU_UDTB."
UD_27_URDU_UDTB_TEST = _UD_27_HOME + "UD_Urdu-UDTB/ur_udtb-ud-test.conllu"
"UD_27 test set of URDU_UDTB."
UD_27_UYGHUR_UDT_TRAIN = _UD_27_HOME + "UD_Uyghur-UDT/ug_udt-ud-train.conllu"
"UD_27 train set of UYGHUR_UDT."
UD_27_UYGHUR_UDT_DEV = _UD_27_HOME + "UD_Uyghur-UDT/ug_udt-ud-dev.conllu"
"UD_27 dev set of UYGHUR_UDT."
UD_27_UYGHUR_UDT_TEST = _UD_27_HOME + "UD_Uyghur-UDT/ug_udt-ud-test.conllu"
"UD_27 test set of UYGHUR_UDT."
UD_27_VIETNAMESE_VTB_TRAIN = _UD_27_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-train.conllu"
"UD_27 train set of VIETNAMESE_VTB."
UD_27_VIETNAMESE_VTB_DEV = _UD_27_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu"
"UD_27 dev set of VIETNAMESE_VTB."
UD_27_VIETNAMESE_VTB_TEST = _UD_27_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-test.conllu"
"UD_27 test set of VIETNAMESE_VTB."
UD_27_WARLPIRI_UFAL_TEST = _UD_27_HOME + "UD_Warlpiri-UFAL/wbp_ufal-ud-test.conllu"
"UD_27 test set of WARLPIRI_UFAL."
UD_27_WELSH_CCG_TRAIN = _UD_27_HOME + "UD_Welsh-CCG/cy_ccg-ud-train.conllu"
"UD_27 train set of WELSH_CCG."
UD_27_WELSH_CCG_TEST = _UD_27_HOME + "UD_Welsh-CCG/cy_ccg-ud-test.conllu"
"UD_27 test set of WELSH_CCG."
UD_27_WOLOF_WTB_TRAIN = _UD_27_HOME + "UD_Wolof-WTB/wo_wtb-ud-train.conllu"
"UD_27 train set of WOLOF_WTB."
UD_27_WOLOF_WTB_DEV = _UD_27_HOME + "UD_Wolof-WTB/wo_wtb-ud-dev.conllu"
"UD_27 dev set of WOLOF_WTB."
UD_27_WOLOF_WTB_TEST = _UD_27_HOME + "UD_Wolof-WTB/wo_wtb-ud-test.conllu"
"UD_27 test set of WOLOF_WTB."
UD_27_YORUBA_YTB_TEST = _UD_27_HOME + "UD_Yoruba-YTB/yo_ytb-ud-test.conllu"
"UD_27 test set of YORUBA_YTB."


================================================
FILE: hanlp/datasets/parsing/ud/ud27m.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-21 20:39
import os

from hanlp.datasets.parsing.ud import concat_treebanks
from hanlp.datasets.parsing.ud.ud27 import _UD_27_HOME

# NOTE: ``concat_treebanks`` runs when this module is imported. Its return value is used below
# as the directory that holds the concatenated ``train``/``dev``/``test`` CoNLL-U files.
# NOTE(review): presumably this triggers fetching/merging of every UD 2.7 treebank on first
# import -- confirm against ``concat_treebanks`` before importing this module eagerly.
_UD_27_MULTILINGUAL_HOME = concat_treebanks(_UD_27_HOME, '2.7')
UD_27_MULTILINGUAL_TRAIN = os.path.join(_UD_27_MULTILINGUAL_HOME, 'train.conllu')
"Training set of multilingual UD_27 obtained by concatenating all training sets."
UD_27_MULTILINGUAL_DEV = os.path.join(_UD_27_MULTILINGUAL_HOME, 'dev.conllu')
"Dev set of multilingual UD_27 obtained by concatenating all dev sets."
UD_27_MULTILINGUAL_TEST = os.path.join(_UD_27_MULTILINGUAL_HOME, 'test.conllu')
"Test set of multilingual UD_27 obtained by concatenating all test sets."


================================================
FILE: hanlp/datasets/pos/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:50

================================================
FILE: hanlp/datasets/pos/ctb5.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:51

# Archive that bundles the three CTB5 part-of-speech splits.
_CTB5_POS_HOME = 'http://file.hankcs.com/corpus/ctb5.1-pos.zip'

# Each split is addressed as ``<archive url>#<file name>``.
# NOTE(review): the ``#`` fragment presumably selects a member of the zip -- confirm against the
# resource loader.
CTB5_POS_TRAIN = _CTB5_POS_HOME + '#train.tsv'
'''PoS training set for CTB5.'''
CTB5_POS_DEV = _CTB5_POS_HOME + '#dev.tsv'
'''PoS dev set for CTB5.'''
CTB5_POS_TEST = _CTB5_POS_HOME + '#test.tsv'
'''PoS test set for CTB5.'''


================================================
FILE: hanlp/datasets/qa/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-20 19:17

================================================
FILE: hanlp/datasets/qa/hotpotqa.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-20 19:46
from enum import Enum, auto

import torch
import ujson
from torch.nn.utils.rnn import pad_sequence

from hanlp.common.dataset import TransformableDataset
from hanlp_common.util import merge_list_of_dict

# Remote locations of the HotpotQA JSON files (file names carry the release version).
# Training split, v1.1.
HOTPOT_QA_TRAIN = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json'
# Dev split, "distractor" variant, v1.
HOTPOT_QA_DISTRACTOR_DEV = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json'
# Dev split, "fullwiki" variant, v1.
HOTPOT_QA_FULLWIKI_DEV = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json'


class HotpotQADataset(TransformableDataset):
    """Dataset whose samples are read from a single JSON file."""

    def load_file(self, filepath):
        """Decode the JSON document stored at ``filepath``.

        Args:
            filepath: Path of a local JSON file.

        Returns:
            The object produced by ``ujson.load`` for that file.
        """
        # JSON is UTF-8 by specification, so decode explicitly instead of relying on the
        # platform's locale encoding (which is not UTF-8 on every system).
        with open(filepath, encoding='utf-8') as fd:
            return ujson.load(fd)


class BuildGraph:
    """Callable transform that attaches the result of ``build_graph`` to a sample."""

    def __init__(self, dst='graph') -> None:
        """Remember the key under which the graph will be stored.

        Args:
            dst: Key of the sample dict that receives the graph.
        """
        super().__init__()
        self.dst = dst

    def __call__(self, sample: dict):
        """Build the graph for ``sample``, store it under ``self.dst`` and return the sample."""
        graph = build_graph(sample)
        sample[self.dst] = graph
        return sample


def hotpotqa_collate_fn(samples):
    """Collate graph samples into one batch of dense tensors.

    Args:
        samples: Sequence of dicts, each holding a ``graph`` (list of vertices) and ``token_id``.

    Returns:
        The dict returned by ``merge_list_of_dict(samples)`` with these entries set:
        ``adj`` (symmetric 0/1 adjacency per sample), ``tokens`` (padded token ids),
        ``src_mask`` (True where a vertex exists), ``seq_lengths`` (length of every token
        sequence before padding), ``token_offset`` (running vertex counter over the whole
        batch), ``sp_candidate_mask`` and ``sp_label``.
    """
    batch = merge_list_of_dict(samples)
    batch_size = len(samples)
    max_seq_len = max(len(sample['graph']) for sample in samples)
    per_vertex = [batch_size, max_seq_len]

    arc = torch.zeros([batch_size, max_seq_len, max_seq_len])
    token_offset = torch.zeros(per_vertex, dtype=torch.long)
    src_mask = torch.zeros(per_vertex, dtype=torch.bool)
    sp_candidate_mask = torch.zeros(per_vertex, dtype=torch.bool)
    sp_label = torch.zeros(per_vertex, dtype=torch.float)

    tokens = []
    offset = 0  # not reset between samples: it indexes vertices across the whole batch
    for i, sample in enumerate(samples):
        for u in sample['graph']:
            # every edge is written in both directions
            for v in u.to:
                arc[i, v.id, u.id] = 1
                arc[i, u.id, v.id] = 1
            token_offset[i, u.id] = offset
            src_mask[i, u.id] = True
            sp_candidate_mask[i, u.id] = u.is_sp_root_candidate()
            sp_label[i, u.id] = u.is_sp_root()
            offset += 1
        tokens.extend(sample['token_id'])

    # lengths have to be taken before padding
    seq_lengths = torch.LongTensor([len(x) for x in tokens])
    tokens = pad_sequence([torch.LongTensor(x) for x in tokens], batch_first=True)

    for key, value in (
            ('adj', arc),
            ('tokens', tokens),
            ('src_mask', src_mask),
            ('seq_lengths', seq_lengths),
            ('token_offset', token_offset),
            ('sp_candidate_mask', sp_candidate_mask),
            ('sp_label', sp_label),
    ):
        batch[key] = value
    return batch


def flat_sentence(sample: dict) -> dict:
    """Store lower-cased tokens of every parsed sentence under ``sample['token']``.

    Each entry of ``sample['parsed_sentences']`` contributes one list that starts with the
    literal ``'bos'`` followed by the lower-cased items of its first element.

    Args:
        sample: Dict with a ``parsed_sentences`` entry; modified in place.

    Returns:
        The same ``sample`` object.
    """
    flattened = sample['token'] = []
    flattened.extend(
        ['bos'] + [word.lower() for word in parsed[0]]
        for parsed in sample['parsed_sentences']
    )
    return sample


def create_sp_label(sample: dict) -> dict:
    """Attach one 0/1 label per context sentence under ``sample['sp_label']``.

    A sentence is labelled 1 when its ``(title, index)`` pair occurs in
    ``sample['supporting_facts']``, otherwise 0. Labels follow the order of
    ``sample['context']``.

    Args:
        sample: Dict with ``context`` (pairs of title and sentences) and
            ``supporting_facts`` (pairs of title and sentence index); modified in place.

    Returns:
        The same ``sample`` object.

    Raises:
        AssertionError: If the number of positive labels differs from the number of
            supporting facts.
    """
    labels = sample['sp_label'] = []
    for title, sents in sample['context']:
        for idx, _ in enumerate(sents):
            is_supporting = any(t == title and i == idx for t, i in sample['supporting_facts'])
            labels.append(1 if is_supporting else 0)
    assert len(sample['supporting_facts']) == sum(labels)
    return sample


class Type(Enum):
    """Kinds of vertices that appear in the graph produced by ``build_graph``."""
    # Root vertex created for the question (sentence index 0).
    Q_ROOT = auto()
    # Vertex of a token belonging to the question.
    Q_WORD = auto()
    # Root vertex of a context sentence that is a supporting fact.
    SP_ROOT = auto()
    # Vertex of a token inside a supporting-fact sentence.
    SP_WORD = auto()
    # Root vertex of a context sentence that is not a supporting fact.
    NON_SP_ROOT = auto()
    # Vertex of a token inside a non-supporting sentence.
    NON_SP_WORD = auto()
    # NOTE(review): never assigned by ``build_graph`` in this module -- confirm whether it is
    # still needed.
    DOCUMENT_TITLE = auto()


class Vertex(object):
    """Graph node carrying an id, a ``Type``, a text label and labelled outgoing edges."""

    def __init__(self, id, type: Type, text=None) -> None:
        """Create a vertex without edges.

        Args:
            id: Identifier of the vertex; also returned by ``__hash__``.
            type: Kind of the vertex.
            text: Label of the vertex. When falsy, the part of ``str(type)`` after the last
                ``.`` is used instead.
        """
        super().__init__()
        self.id = id
        self.type = type
        self.text = text if text else str(type).rpartition('.')[2]
        self.to = []  # target vertices, parallel to ``self.rel``
        self.rel = []  # relation of each edge, parallel to ``self.to``

    def connect(self, to, rel):
        """Add an outgoing edge to vertex ``to`` with relation ``rel``."""
        self.to.append(to)
        self.rel.append(rel)

    def __str__(self) -> str:
        return '{} {}'.format(self.text, self.id)

    def __hash__(self) -> int:
        return self.id

    def is_word(self):
        """Return True for any of the three word types."""
        return self.type in (Type.SP_WORD, Type.Q_WORD, Type.NON_SP_WORD)

    def is_question(self):
        """Return True for the question root or a question word."""
        return self.type in (Type.Q_ROOT, Type.Q_WORD)

    def is_sp(self):
        """Return True for a supporting-fact root or word."""
        return self.type in (Type.SP_ROOT, Type.SP_WORD)

    def is_sp_root(self):
        """Return True only for a supporting-fact root."""
        return self.type in (Type.SP_ROOT,)

    def is_sp_root_candidate(self):
        """Return True for the root of any context sentence, supporting or not."""
        return self.type in (Type.SP_ROOT, Type.NON_SP_ROOT)


def build_graph(each: dict, debug=False):
    """Turn one sample into a flat list of connected ``Vertex`` objects.

    Sentence 0 is the question; the context sentences follow in document order. Every
    sentence contributes one root vertex followed by one vertex per token. Dependency arcs
    connect a head to its dependents (head ``0`` denotes the sentence root), and the question
    root is finally connected to the root of every context sentence.

    Args:
        each: Dict providing ``question``, ``supporting_facts`` (pairs of title and sentence
            index), ``context`` (pairs of title and sentences) and ``parsed_sentences``
            (triples of tokens, heads and relations, aligned with question + context).
        debug: When True, print the question and supporting facts and skip sentences with
            index greater than 1 that are not supporting facts.

    Returns:
        List of vertices whose ``id`` equals their position in the list.
    """
    raw_sents = [each['question']]

    # title -> sentence indices (within that document) marked as supporting facts
    sp_sents = {}
    for title, sent_id in each['supporting_facts']:
        sp_sents.setdefault(title, set()).add(sent_id)

    # global sentence indices (the question occupies 0) that are supporting facts
    sp_idx = set()
    position = 1
    for title, sents in each['context']:
        raw_sents += sents
        marked = sp_sents.get(title, ())
        for i, _ in enumerate(sents):
            if i in marked:
                sp_idx.add(position)
            position += 1
    assert position == len(raw_sents)

    parsed_sents = each['parsed_sentences']
    assert len(raw_sents) == len(parsed_sents)

    graph = []
    for idx, (raw, sent) in enumerate(zip(raw_sents, parsed_sents)):
        is_sp = idx in sp_idx
        # NOTE(review): ``idx > 1`` also keeps sentence 1 when it is not a supporting fact --
        # confirm this is intended.
        if debug and idx > 1 and not is_sp:
            continue
        offset = len(graph)
        if idx == 0:
            if debug:
                print(f'Question: {raw}')
            root_type, word_type = Type.Q_ROOT, Type.Q_WORD
        else:
            if debug and is_sp:
                print(f'Supporting Fact: {raw}')
            if is_sp:
                root_type, word_type = Type.SP_ROOT, Type.SP_WORD
            else:
                root_type, word_type = Type.NON_SP_ROOT, Type.NON_SP_WORD
        graph.append(Vertex(offset, root_type))

        tokens, heads, deprels = sent
        for k, (token, _, _) in enumerate(zip(tokens, heads, deprels), start=offset + 1):
            graph.append(Vertex(k, word_type, token))
        # head 0 resolves to the sentence root stored at ``offset``
        for child, (head, rel) in enumerate(zip(heads, deprels), start=offset + 1):
            graph[offset + head].connect(graph[child], rel)

    q_root = graph[0]
    for u in graph:
        if u.type in (Type.SP_ROOT, Type.NON_SP_ROOT):
            q_root.connect(u, 'supporting fact?')
    return graph


================================================
FILE: hanlp/datasets/srl/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-22 19:15


================================================
FILE: hanlp/datasets/srl/loaders/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:05


================================================
FILE: hanlp/datasets/srl/loaders/conll2012.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-22 19:15
import glob
import json
import os
from typing import Union, List, Callable

from hanlp.utils.span_util import enumerate_spans

from hanlp.common.dataset import TransformableDataset
from hanlp.common.transform import NamedTransform
from hanlp.utils.io_util import read_tsv_as_sents, get_resource, TimingFileIterator
from hanlp.utils.time_util import CountdownTimer


class CoNLL2012BIOSRLDataset(TransformableDataset):
    """Reads CoNLL 2012 ``*gold_conll`` files and yields one BIO-tagged SRL sample per sentence."""

    def load_file(self, filepath: str):
        """Yield ``{'token': ..., 'srl': ...}`` samples from a CoNLL 2012 file or directory.

        Args:
            filepath: A single file, or a directory that is searched recursively for files whose names end
                with ``gold_conll``. The path is first resolved through ``get_resource``.

        Yields:
            One dict per sentence. ``token`` holds column 3 of every row. ``srl`` holds one BIO label
            sequence per token: rows whose column 7 is not ``'-'`` are paired, in order, with the proposition
            columns (``11:-1``); every other token gets an all-``'O'`` sequence.
        """
        filepath = get_resource(filepath)
        if os.path.isfile(filepath):
            files = [filepath]
        else:
            assert os.path.isdir(filepath), f'{filepath} has to be a directory of CoNLL 2012'
            files = sorted(glob.glob(f'{filepath}/**/*gold_conll', recursive=True))
        timer = CountdownTimer(len(files))
        for path in files:
            timer.log('files loading[blink][yellow]...[/yellow][/blink]')
            # 0:DOCUMENT 1:PART 2:INDEX 3:WORD 4:POS 5:PARSE 6:LEMMA 7:FRAME 8:SENSE 9:SPEAKER 10:NE 11-N:ARGS N:COREF
            for sent in read_tsv_as_sents(path, ignore_prefix='#'):
                frames = [row[7] for row in sent]
                # Transpose the per-row argument cells into one column per proposition.
                columns = zip(*(row[11:-1] for row in sent))
                bio_per_predicate = [self._remove_B_V(self._make_bio_labels(column)) for column in columns]
                predicate_positions = [i for i, frame in enumerate(frames) if frame != '-']
                token = [row[3] for row in sent]
                srl = [None] * len(token)
                for position, labels in zip(predicate_positions, bio_per_predicate):
                    srl[position] = labels
                # Non-predicate tokens receive a fresh all-'O' sequence each.
                yield {'token': token, 'srl': [labels or ['O'] * len(token) for labels in srl]}

    @staticmethod
    def _make_bio_labels(prop):
        """Convert one column of bracketed argument labels into BIO labels.

        Copied from https://github.com/hiroki13/span-based-srl/blob/2c8b677c4e00b6c607e09ef4f9fe3d54961e4f2e/src/utils/sent.py#L42

        Args:
          prop: 1D: n_words; elem=bracket label

        Returns:
          1D: n_words; elem=BIO label

        """
        labels = []
        open_role = None  # role of the span we are currently inside, if any
        for arg in prop:
            closes = arg.endswith(')')
            if arg.startswith('('):
                if closes:
                    # Single-token span such as "(ARG0*)": nothing stays open.
                    role = arg.split("*")[0][1:]
                    open_role = None
                else:
                    # Span opener such as "(ARG1*": strip the bracket and the trailing character.
                    role = open_role = arg[1:-1]
                labels.append('B-' + role)
            elif open_role:
                labels.append('I-' + open_role)
                if closes:
                    open_role = None
            else:
                labels.append('O')
        return labels

    @staticmethod
    def _remove_B_V(labels):
        """Return a copy of ``labels`` in which every ``'B-V'`` is replaced with ``'O'``."""
        return [('O' if tag == 'B-V' else tag) for tag in labels]


class CoNLL2012SRLDataset(TransformableDataset):
    """Dataset over ``.jsonlines`` CoNLL12-style corpora yielding tokens with span-based SRL tuples."""

    def __init__(self,
                 data: Union[str, List],
                 transform: Union[Callable, List] = None,
                 cache=None,
                 doc_level_offset=True,
                 generate_idx=None) -> None:
        """
        Args:
            data: Forwarded to the base class constructor.
            transform: Forwarded to the base class constructor.
            cache: Forwarded to the base class constructor.
            doc_level_offset: ``True`` if the SRL offsets in the corpus count tokens from the start of the
                document, in which case they are shifted to be relative to their sentence.
            generate_idx: Forwarded to the base class constructor.
        """
        # Stored before delegating to the base constructor.
        # NOTE(review): presumably the base constructor may already call ``load_file`` — confirm.
        self.doc_level_offset = doc_level_offset
        super().__init__(data, transform, cache, generate_idx=generate_idx)

    def load_file(self, filepath: str):
        """Load ``.jsonlines`` CoNLL12-style corpus. Samples of this corpus can be found using the following scripts.

        .. highlight:: python
        .. code-block:: python

            import json
            from hanlp_common.document import Document
            from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_DEV
            from hanlp.utils.io_util import get_resource

            with open(get_resource(ONTONOTES5_CONLL12_CHINESE_DEV)) as src:
                for line in src:
                    doc = json.loads(line)
                    print(Document(doc))
                    break

        Args:
            filepath: ``.jsonlines`` CoNLL12 corpus.

        Yields:
            The result of :meth:`build_sample` for every sentence of every document.

        Raises:
            ValueError: If an offset is negative or not smaller than the sentence length after the optional
                document-to-sentence shift.
        """
        filename = os.path.basename(filepath)
        reader = TimingFileIterator(filepath)
        num_docs = num_sentences = 0
        for line in reader:
            doc = json.loads(line)
            num_docs += 1
            # Number of tokens in the sentences of this document that precede the current one.
            num_tokens_in_doc = 0
            for sid, (sentence, srl) in enumerate(zip(doc['sentences'], doc['srl'])):
                if self.doc_level_offset:
                    srl = [(x[0] - num_tokens_in_doc, x[1] - num_tokens_in_doc, x[2] - num_tokens_in_doc, x[3])
                           for x in srl]
                else:
                    srl = [(x[0], x[1], x[2], x[3]) for x in srl]
                for offsets in (x[:3] for x in srl):
                    if any(o < 0 for o in offsets):
                        raise ValueError('Negative offset occurred, maybe doc_level_offset=False')
                    if any(o >= len(sentence) for o in offsets):
                        raise ValueError('Offset exceeds sentence length, maybe doc_level_offset=True')
                # Keep only the first label seen for each (predicate, begin, end) triple.
                seen = set()
                deduplicated_srl = set()
                for p, b, e, l in srl:
                    if (p, b, e) not in seen:
                        seen.add((p, b, e))
                        deduplicated_srl.add((p, b, e, l))
                yield self.build_sample(sentence, deduplicated_srl, doc, sid)
                num_sentences += 1
                num_tokens_in_doc += len(sentence)
            reader.log(
                f'{filename} {num_docs} documents, {num_sentences} sentences [blink][yellow]...[/yellow][/blink]')
        reader.erase()

    # noinspection PyMethodMayBeStatic
    def build_sample(self, sentence, deduplicated_srl, doc, sid):
        """Assemble the sample for one sentence; subclasses may override to emit other formats.

        Args:
            sentence: Tokens of the sentence.
            deduplicated_srl: Set of ``(predicate, begin, end, label)`` tuples.
            doc: The parsed JSON document the sentence belongs to (unused here).
            sid: Index of the sentence inside ``doc`` (unused here).

        Returns:
            A dict with ``token`` and ``srl``.
        """
        return {'token': sentence, 'srl': deduplicated_srl}


def group_pa_by_p(sample: dict) -> dict:
    """Replace ``sample['srl']`` (if present) with its arguments grouped by predicate.

    Args:
        sample: A sample whose optional ``srl`` field is an iterable of ``(p, b, e, l)`` tuples.

    Returns:
        The same ``sample`` object, modified in place.
    """
    if 'srl' not in sample:
        return sample
    sample['srl'] = group_pa_by_p_(sample['srl'])
    return sample


def group_pa_by_p_(srl):
    """Group ``(p, b, e, l)`` tuples by their first element.

    Args:
        srl: Iterable of 4-tuples ``(p, b, e, l)``.

    Returns:
        A dict mapping each ``p`` to the set of its ``(b, e, l)`` tuples, keyed in order of first occurrence.
    """
    arguments_of = {}
    for p, b, e, l in srl:
        arguments_of.setdefault(p, set()).add((b, e, l))
    return arguments_of


def filter_v_args(sample: dict) -> dict:
    """Drop entries labelled ``V`` or ``C-V`` from ``sample['srl']`` (if present).

    Args:
        sample: A sample whose optional ``srl`` field is an iterable of tuples ending with a label.

    Returns:
        The same ``sample`` object; its ``srl`` field, when present, is replaced by a new list.
    """
    if 'srl' not in sample:
        return sample
    predicate_labels = ("V", "C-V")
    kept = []
    for entry in sample['srl']:
        if entry[-1] not in predicate_labels:
            kept.append(entry)
    sample['srl'] = kept
    return sample


def unpack_srl(sample: dict) -> dict:
    """Split ``sample['srl']`` (if present) into four parallel lists stored on the sample.

    Args:
        sample: A sample whose optional ``srl`` field is an iterable of tuples laid out as
            ``(predicate, begin, end, ..., label)``.

    Returns:
        The same ``sample`` object, with ``predicate_offset``, ``argument_begin_offset``,
        ``argument_end_offset`` and ``srl_label`` added when ``srl`` is present.
    """
    if 'srl' not in sample:
        return sample
    srl = sample['srl']
    # Target field -> position of the value inside each srl tuple.
    layout = (
        ('predicate_offset', 0),
        ('argument_begin_offset', 1),
        ('argument_end_offset', 2),
        ('srl_label', -1),  # We can obtain mask by srl_label > 0
    )
    sample.update({field: [x[position] for x in srl] for field, position in layout})
    return sample


class SpanCandidatesGenerator(NamedTransform):
    """Transform that stores the spans enumerated over one field of a sample into another field."""

    def __init__(self, src: str, dst: str = None, max_span_width=None) -> None:
        """
        Args:
            src: Name of the field to enumerate spans over.
            dst: Name of the field receiving the spans; defaults to ``f'{src}_span'`` when falsy.
            max_span_width: Forwarded to ``enumerate_spans`` as ``max_span_width``.
        """
        super().__init__(src, dst or f'{src}_span')
        self.max_span_width = max_span_width

    def __call__(self, sample: dict) -> dict:
        """Write ``list(enumerate_spans(sample[src], ...))`` to ``sample[dst]`` and return ``sample``."""
        # NOTE(review): relies on the base class exposing ``self.src`` / ``self.dst`` — confirm in NamedTransform.
        spans = enumerate_spans(sample[self.src], max_span_width=self.max_span_width)
        sample[self.dst] = list(spans)
        return sample


class CoNLL2012SRLBIODataset(CoNLL2012SRLDataset):
    """Variant of :class:`CoNLL2012SRLDataset` whose samples carry per-predicate BIO label sequences."""

    def build_sample(self, tokens, deduplicated_srl, doc, sid):
        """Build a BIO-tagged sample for one sentence.

        Args:
            tokens: Tokens of the sentence.
            deduplicated_srl: Iterable of ``(predicate, begin, end, label)`` tuples with an inclusive ``end``.
            doc: The parsed JSON document; its optional ``pos`` field is indexed by ``sid``.
            sid: Index of the sentence inside ``doc``.

        Returns:
            A dict with ``token``, ``srl`` (an ``n x n`` grid of BIO labels, row ``p`` belonging to the token
            at index ``p``), ``srl_set`` (the tuples with an exclusive end and without ``V`` entries) and,
            when ``doc`` has it, ``pos``.
        """
        # Convert srl to exclusive format
        deduplicated_srl = {(x[0], x[1], x[2] + 1, x[3]) for x in deduplicated_srl if x[3] != 'V'}
        labels = [['O'] * len(tokens) for _ in tokens]
        srl = group_pa_by_p_(deduplicated_srl)
        for p, args in sorted(srl.items()):
            labels_per_p = labels[p]
            # ``args`` is a set holding strings, so its iteration order changes between interpreter runs
            # (string hash randomization). Sorting makes the tagging reproducible when spans overlap;
            # for non-overlapping spans the result is unchanged.
            for start, end, label in sorted(args):
                assert end > start
                assert label != 'V'  # We don't predict predicate
                labels_per_p[start] = 'B-' + label
                for j in range(start + 1, end):
                    labels_per_p[j] = 'I-' + label
        sample = {
            'token': tokens,
            'srl': labels,
            'srl_set': deduplicated_srl,
        }
        if 'pos' in doc:
            sample['pos'] = doc['pos'][sid]
        return sample


================================================
FILE: hanlp/datasets/srl/loaders/ontonotes_loader.py
================================================
from typing import DefaultDict, List, Optional, Iterator, Set, Tuple, Dict
from collections import defaultdict
import codecs
import os
import logging

from hanlp.utils.span_util import TypedSpan, enumerate_spans
from phrasetree.tree import Tree

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class OntonotesSentence:
    """
    Plain container for the annotations of a single CONLL formatted sentence.

    # Parameters

    document_id : `str`
        This is a variation on the document filename
    sentence_id : `int`
        The integer ID stored alongside the sentence.
    words : `List[str]`
        This is the tokens as segmented/tokenized in the Treebank.
    pos_tags : `List[str]`
        This is the Penn-Treebank-style part of speech. When parse information is missing,
        all parts of speech except the one for which there is some sense or proposition
        annotation are marked with a XX tag. The verb is marked with just a VERB tag.
    parse_tree : `Optional[Tree]`
        A tree representing the parse. It includes POS tags as pre-terminal nodes.
        When the parse information is missing, the parse will be `None`.
    predicate_lemmas : `List[Optional[str]]`
        The predicate lemma of the words for which we have semantic role
        information or word sense information. All other indices are `None`.
    predicate_framenet_ids : `List[Optional[str]]`
        The PropBank frameset ID of the lemmas in `predicate_lemmas`, or `None`.
    word_senses : `List[Optional[float]]`
        The word senses for the words in the sentence, or `None`. These are floats
        because the word sense can have values after the decimal, like `1.1`.
    speakers : `List[Optional[str]]`
        The speaker information for the words in the sentence, if present, or `None`
        This is the speaker or author name where available. Mostly in Broadcast Conversation
        and Web Log data.
    named_entities : `List[str]`
        The BIO tags for named entities in the sentence.
    srl_frames : `List[Tuple[str, List[str]]]`
        A list of `(verb, labels)` pairs, where `labels` are the Propbank frame labels
        for that verb in a BIO format.
    coref_spans : `Set[TypedSpan]`
        The spans for entity mentions involved in coreference resolution within the sentence.
        Each element is a tuple composed of (cluster_id, (start_index, end_index)). Indices
        are `inclusive`.
    """

    def __init__(
        self,
        document_id: str,
        sentence_id: int,
        words: List[str],
        pos_tags: List[str],
        parse_tree: Optional[Tree],
        predicate_lemmas: List[Optional[str]],
        predicate_framenet_ids: List[Optional[str]],
        word_senses: List[Optional[float]],
        speakers: List[Optional[str]],
        named_entities: List[str],
        srl_frames: List[Tuple[str, List[str]]],
        coref_spans: Set[TypedSpan],
    ) -> None:
        # Every argument is stored unchanged under an attribute of the same name.
        # Identification.
        self.document_id, self.sentence_id = document_id, sentence_id
        # Token-level syntax.
        self.words, self.pos_tags, self.parse_tree = words, pos_tags, parse_tree
        # Predicate and sense information.
        self.predicate_lemmas, self.predicate_framenet_ids = predicate_lemmas, predicate_framenet_ids
        self.word_senses, self.speakers = word_senses, speakers
        # Span-level annotations.
        self.named_entities, self.srl_frames, self.coref_spans = named_entities, srl_frames, coref_spans


class Ontonotes:
    """
    This `DatasetReader` is designed to read in the English OntoNotes v5.0 data
    in the format used by the CoNLL 2011/2012 shared tasks. In order to use this
    Reader, you must follow the instructions provided [here (v12 release):]
    (https://cemantix.org/data/ontonotes.html), which will allow you to download
    the CoNLL style annotations for the  OntoNotes v5.0 release -- LDC2013T19.tgz
    obtained from LDC.

    Once you have run the scripts on the extracted data, you will have a folder
    structured as follows:

    ```
    conll-formatted-ontonotes-5.0/
     ── data
       ├── development
           └── data
               └── english
                   └── annotations
                       ├── bc
                       ├── bn
                       ├── mz
                       ├── nw
                       ├── pt
                       ├── tc
                       └── wb
       ├── test
           └── data
               └── english
                   └── annotations
                       ├── bc
                       ├── bn
                       ├── mz
                       ├── nw
                       ├── pt
                       ├── tc
                       └── wb
       └── train
           └── data
               └── english
                   └── annotations
                       ├── bc
                       ├── bn
                       ├── mz
                       ├── nw
                       ├── pt
                       ├── tc
                       └── wb
    ```

    The file path provided to this class can then be any of the train, test or development
    directories (or the top level data directory, if you are not utilizing the splits).

    The data has the following format, ordered by column.

    1.  Document ID : `str`
        This is a variation on the document filename
    2.  Part number : `int`
        Some files are divided into multiple parts numbered as 000, 001, 002, ... etc.
    3.  Word number : `int`
        This is the word index of the word in that sentence.
    4.  Word : `str`
        This is the token as segmented/tokenized in the Treebank. Initially the `*_skel` file
        contain the placeholder [WORD] which gets replaced by the actual token from the
        Treebank which is part of the OntoNotes release.
    5.  POS Tag : `str`
        This is the Penn Treebank style part of speech. When parse information is missing,
        all parts of speech except the one for which there is some sense or proposition
        annotation are marked with a XX tag. The verb is marked with just a VERB tag.
    6.  Parse bit : `str`
        This is the bracketed structure broken before the first open parenthesis in the parse,
        and the word/part-of-speech leaf replaced with a `*`. When the parse information is
        missing, the first word of a sentence is tagged as `(TOP*` and the last word is tagged
        as `*)` and all intermediate words are tagged with a `*`.
    7.  Predicate lemma : `str`
        The predicate lemma is mentioned for the rows for which we have semantic role
        information or word sense information. All other rows are marked with a "-".
    8.  Predicate Frameset ID : `int`
        The PropBank frameset ID of the predicate in Column 7.
    9.  Word sense : `float`
        This is the word sense of the word in Column 3.
    10. Speaker/Author : `str`
        This is the speaker or author name where available. Mostly in Broadcast Conversation
        and Web Log data. When not available the rows are marked with an "-".
    11. Named Entities : `str`
        This column identifies the spans representing various named entities. For documents
        which do not have named entity annotation, each line is represented with an `*`.
    12. Predicate Arguments : `str`
        There is one column each of predicate argument structure information for the predicate
        mentioned in Column 7. If there are no predicates tagged in a sentence this is a
        single column with all rows marked with an `*`.
    -1. Co-reference : `str`
        Co-reference chain information encoded in a parenthesis structure. For documents that do
         not have co-reference annotations, each line is represented with a "-".
    """

    def dataset_iterator(self, file_path: str) -> Iterator[OntonotesSentence]:
        """
        An iterator over the entire dataset, yielding all sentences processed.
        """
        for conll_file in self.dataset_path_iterator(file_path):
            yield from self.sentence_iterator(conll_file)

    @staticmethod
    def dataset_path_iterator(file_path: str) -> Iterator[str]:
        """
        Walk `file_path` recursively and yield the path of every file
        whose name ends with "gold_conll".
        """
        logger.info("Reading CONLL sentences from dataset files at: %s", file_path)
        for root, _, files in list(os.walk(file_path)):
            # Pre-processing leaves two files per document: filename.gold_skel (the
            # original skeleton) and the generated filename.gold_conll. Only the
            # latter is wanted.
            yield from (
                os.path.join(root, data_file) for data_file in files if data_file.endswith("gold_conll")
            )

    def dataset_document_iterator(self, file_path: str) -> Iterator[List[OntonotesSentence]]:
        """
        An iterator over CONLL formatted files which yields documents, regardless
        of the number of document annotations in a particular file. This is useful
        for conll data which has been preprocessed, such as the preprocessing which
        takes place for the 2012 CONLL Coreference Resolution task.
        """
        with codecs.open(file_path, "r", encoding="utf8") as open_file:
            conll_rows = []
            document: List[OntonotesSentence] = []
            for line in open_file:
                line = line.strip()
                if line != "" and not line.startswith("#"):
                    # Non-empty line. Collect the annotation.
                    conll_rows.append(line)
                else:
                    if conll_rows:
                        document.append(self._conll_rows_to_sentence(conll_rows))
                        conll_rows = []
                if line.startswith("#end document"):
                    yield document
                    document = []
            if document:
                # Collect any stragglers or files which might not
                # have the '#end document' format for the end of the file.
                yield document

    def sentence_iterator(self, file_path: str) -> Iterator[OntonotesSentence]:
        """
        An iterator over the sentences in an individual CONLL formatted file.
        """
        for document in self.dataset_document_iterator(file_path):
            for sentence in document:
                yield sentence

    def _conll_rows_to_sentence(self, conll_rows: List[str]) -> OntonotesSentence:
        """
        Convert the rows belonging to one sentence into an `OntonotesSentence`.

        # Parameters

        conll_rows : `List[str]`
            One row per word. Each row is split on whitespace and its columns are read by
            position: 0 document id, 1 sentence id, 3 word, 4 POS tag, 5 parse bit, 6 lemma,
            7 frameset id, 8 word sense, 9 speaker, 10 up to the last column (exclusive) the
            span annotations (named entities first, then one column per SRL frame), and the
            last column the coreference label.

        # Returns

        An `OntonotesSentence`. Its `parse_tree` is `None` unless every word contributed a
        parse piece, and its `srl_frames` pair the verbal predicates, in sentence order,
        with the SRL label columns.
        """
        document_id: str = None
        sentence_id: int = None
        # The words in the sentence.
        sentence: List[str] = []
        # The pos tags of the words in the sentence.
        pos_tags: List[str] = []
        # the pieces of the parse tree.
        parse_pieces: List[str] = []
        # The lemmatised form of the words in the sentence which
        # have SRL or word sense information.
        predicate_lemmas: List[str] = []
        # The frameset ID of the predicate, read from column 7 (the class docstring
        # describes that column as the PropBank frameset ID, despite the variable name).
        predicate_framenet_ids: List[str] = []
        # The sense of the word, if available.
        word_senses: List[float] = []
        # The current speaker, if available.
        speakers: List[str] = []

        # Words carrying a "(V" marker in any SRL column, in sentence order.
        verbal_predicates: List[str] = []
        # One BIO label list per span annotation column (NER first, then SRL frames).
        span_labels: List[List[str]] = []
        # The label of the span currently open in each annotation column, or None.
        current_span_labels: List[str] = []

        # Cluster id -> List of (start_index, end_index) spans.
        clusters: DefaultDict[int, List[Tuple[int, int]]] = defaultdict(list)
        # Cluster id -> List of start_indices which are open for this id.
        coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

        for index, row in enumerate(conll_rows):
            conll_components = row.split()

            document_id = conll_components[0]
            # NOTE(review): column 1 is described as the "Part number" in the class
            # docstring, so this may not be a per-sentence index — confirm.
            sentence_id = int(conll_components[1])
            word = conll_components[3]
            pos_tag = conll_components[4]
            parse_piece = conll_components[5]

            # Replace brackets in text and pos tags
            # with a different token for parse trees.
            if pos_tag != "XX" and word != "XX":
                if word == "(":
                    parse_word = "-LRB-"
                elif word == ")":
                    parse_word = "-RRB-"
                else:
                    parse_word = word
                if pos_tag == "(":
                    pos_tag = "-LRB-"
                if pos_tag == ")":
                    pos_tag = "-RRB-"
                (left_brackets, right_hand_side) = parse_piece.split("*")
                # only keep ')' if there are nested brackets with nothing in them.
                right_brackets = right_hand_side.count(")") * ")"
                parse_piece = f"{left_brackets} ({pos_tag} {parse_word}) {right_brackets}"
            else:
                # There are some bad annotations in the CONLL data.
                # They contain no information, so to make this explicit,
                # we just set the parse piece to be None which will result
                # in the overall parse tree being None.
                parse_piece = None

            lemmatised_word = conll_components[6]
            framenet_id = conll_components[7]
            word_sense = conll_components[8]
            speaker = conll_components[9]

            if not span_labels:
                # If this is the first word in the sentence, create
                # empty lists to collect the NER and SRL BIO labels.
                # We can't do this upfront, because we don't know how many
                # components we are collecting, as a sentence can have
                # variable numbers of SRL frames.
                span_labels = [[] for _ in conll_components[10:-1]]
                # Create variables representing the current label for each label
                # sequence we are collecting.
                current_span_labels = [None for _ in conll_components[10:-1]]

            self._process_span_annotations_for_word(
                conll_components[10:-1], span_labels, current_span_labels
            )

            # If any annotation marks this word as a verb predicate,
            # we need to record its index. This also has the side effect
            # of ordering the verbal predicates by their location in the
            # sentence, automatically aligning them with the annotations.
            word_is_verbal_predicate = any("(V" in x for x in conll_components[11:-1])
            if word_is_verbal_predicate:
                verbal_predicates.append(word)

            self._process_coref_span_annotations_for_word(
                conll_components[-1], index, clusters, coref_stacks
            )

            sentence.append(word)
            pos_tags.append(pos_tag)
            parse_pieces.append(parse_piece)
            # "-" marks a missing value in these columns; store None instead.
            predicate_lemmas.append(lemmatised_word if lemmatised_word != "-" else None)
            predicate_framenet_ids.append(framenet_id if framenet_id != "-" else None)
            word_senses.append(float(word_sense) if word_sense != "-" else None)
            speakers.append(speaker if speaker != "-" else None)

        # The first span column holds the named entities; the rest are SRL frames.
        named_entities = span_labels[0]
        srl_frames = [
            (predicate, labels) for predicate, labels in zip(verbal_predicates, span_labels[1:])
        ]

        # A single missing parse piece (None) invalidates the whole tree.
        if all(parse_pieces):
            parse_tree = Tree.fromstring("".join(parse_pieces))
        else:
            parse_tree = None
        # Flatten cluster id -> spans into a set of (cluster_id, span) pairs.
        coref_span_tuples: Set[TypedSpan] = {
            (cluster_id, span) for cluster_id, span_list in clusters.items() for span in span_list
        }
        return OntonotesSentence(
            document_id,
            sentence_id,
            sentence,
            pos_tags,
            parse_tree,
            predicate_lemmas,
            predicate_framenet_ids,
            word_senses,
            speakers,
            named_entities,
            srl_frames,
            coref_span_tuples,
        )

    @staticmethod
    def _process_coref_span_annotations_for_word(
        label: str,
        word_index: int,
        clusters: DefaultDict[int, List[Tuple[int, int]]],
        coref_stacks: DefaultDict[int, List[int]],
    ) -> None:
        """
        For a given coref label, add it to a currently open span(s), complete a span(s) or
        ignore it, if it is outside of all spans. This method mutates the clusters and coref_stacks
        dictionaries.

        # Parameters

        label : `str`
            The coref label for this word.
        word_index : `int`
            The word index into the sentence.
        clusters : `DefaultDict[int, List[Tuple[int, int]]]`
            A dictionary mapping cluster ids to lists of inclusive spans into the
            sentence.
        coref_stacks : `DefaultDict[int, List[int]]`
            Stacks for each cluster id to hold the start indices of active spans (spans
            which we are inside of when processing a given word). Spans with the same id
            can be nested, which is why we collect these opening spans on a stack, e.g:

            [Greg, the baker who referred to [himself]_ID1 as 'the bread man']_ID1
        """
        if label != "-":
            for segment in label.split("|"):
                # The conll representation of coref spans allows spans to
                # overlap. If spans end or begin at the same word, they are
                # separated by a "|".
                if segment[0] == "(":
                    # The span begins at this word.
                    if segment[-1] == ")":
                        # The span begins and ends at this word (single word span).
                        cluster_id = int(segment[1:-1])
                        clusters[cluster_id].append((word_index, word_index))
                    else:
                        # The span is starting, so we record the index of the word.
                        cluster_id = int(segment[1:])
                        coref_stacks[cluster_id].append(word_index)
                else:
                    # The span for this id is ending, but didn't start at this word.
                    # Retrieve the start index from the document state and
                    # add the span to the clusters for this id.
                    cluster_id = int(segment[:-1])
                    start = coref_stacks[cluster_id].pop()
                    clusters[cluster_id].append((start, word_index))

    @staticmethod
    def _process_span_annotations_for_word(
        annotations: List[str],
        span_labels: List[List[str]],
        current_span_labels: List[Optional[str]],
    ) -> None:
        """
        Given a sequence of different label types for a single word and the current
        span label we are inside, compute the BIO tag for each label and append to a list.

        # Parameters

        annotations : `List[str]`
            A list of labels to compute BIO tags for.
        span_labels : `List[List[str]]`
            A list of lists, one for each annotation, to incrementally collect
            the BIO tags for a sequence.
        current_span_labels : `List[Optional[str]]`
            The currently open span per annotation type, or `None` if there is no open span.
        """
        for annotation_index, annotation in enumerate(annotations):
            # strip all bracketing information to
            # get the actual propbank label.
            label = annotation.strip("()*")

            if "(" in annotation:
                # Entering into a span for a particular semantic role label.
                # We append the label and set the current span for this annotation.
                bio_label = "B-" + label
                span_labels[annotation_index].append(bio_label)
                current_span_labels[annotation_index] = label
            elif current_span_labels[annotation_index] is not None:
                # If there's no '(' token, but the current_span_label is not None,
                # then we are inside a span.
                bio_label = "I-" + current_span_labels[annotation_index]
                span_labels[annotation_index].append(bio_label)
            else:
                # We're outside a span.
                span_labels[annotation_index].append("O")
            # Exiting a span, so we reset the current span label for this annotation.
            if ")" in annotation:
                current_span_labels[annotation_index] = None


def make_coref_instance(
        sentences: List[List[str]],
        max_span_width: int,
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
        max_sentences: int = None,
        remove_singleton_clusters: bool = True,
) -> dict:
    """
    Build the fields of one coreference sample from a tokenised document.

    # Parameters

    sentences : `List[List[str]]`, required.
        A list of lists representing the tokenised words and sentences in the document.
    max_span_width : `int`, required.
        The maximum width of candidate spans to consider; passed to `enumerate_spans`.
    gold_clusters : `Optional[List[List[Tuple[int, int]]]]`, optional (default = None)
        A list of all clusters in the document, represented as word spans with absolute indices
        in the entire document. Each cluster contains some number of spans, which can be nested
        and overlap. If there are exact matches between clusters, they will be resolved
        using `_canonicalize_clusters`.
    max_sentences: int, optional (default = None)
        The maximum number of sentences in each document to keep. By default keeps all sentences.
        When the document is truncated, mentions whose end index is not smaller than the number
        of retained tokens are dropped, as are clusters left without mentions.
    remove_singleton_clusters : `bool`, optional (default = True)
        Some datasets contain clusters that are singletons (i.e. no coreferents). This option allows
        the removal of them.

    # Returns

    A `dict` containing the following keys:
        text : `List[str]`
            The words of the (possibly truncated) document, flattened, after `_normalize_word`.
        spans : `List[Tuple[int, int]]`
            The candidate spans produced by `enumerate_spans` for every sentence, called with
            the number of preceding tokens as `offset`.
        clusters : `Optional[List[List[Tuple[int, int]]]]`
            The processed gold clusters, or `None` if `gold_clusters` was not given.
        span_labels : `List[int]`, only present when `gold_clusters` is given
            For each entry of `spans`, the index of the cluster it belongs to, or -1 if it does
            not belong to a cluster.
    """
    if max_sentences is not None and len(sentences) > max_sentences:
        sentences = sentences[:max_sentences]
        total_length = sum(len(sentence) for sentence in sentences)

        if gold_clusters is not None:
            # Keep only mentions ending inside the retained prefix; drop emptied clusters.
            clipped = ([mention for mention in cluster if mention[1] < total_length] for cluster in gold_clusters)
            gold_clusters = [cluster for cluster in clipped if cluster]

    text_field = [_normalize_word(word) for sentence in sentences for word in sentence]

    cluster_dict = {}
    if gold_clusters is not None:
        gold_clusters = _canonicalize_clusters(gold_clusters)
        if remove_singleton_clusters:
            gold_clusters = [cluster for cluster in gold_clusters if len(cluster) > 1]
        # Mention -> index of its cluster; a mention listed twice keeps the later index.
        cluster_dict = {
            tuple(mention): cluster_id
            for cluster_id, cluster in enumerate(gold_clusters)
            for mention in cluster
        }

    has_gold = gold_clusters is not None
    span_field: List = []
    span_labels: Optional[List[int]] = [] if has_gold else None

    sentence_offset = 0
    for sentence in sentences:
        candidates = enumerate_spans(sentence, offset=sentence_offset, max_span_width=max_span_width)
        for start, end in candidates:
            span = (start, end)
            if has_gold:
                span_labels.append(cluster_dict.get(span, -1))
            span_field.append(span)
        sentence_offset += len(sentence)

    fields: Dict[str, List] = {
        "text": text_field,
        "spans": span_field,
        'clusters': gold_clusters,
    }
    if has_gold:
        fields["span_labels"] = span_labels

    return fields


def _normalize_word(word):
    if word in ("/.", "/?"):
        return word[1:]
    else:
        return word


def _canonicalize_clusters(clusters: List[List[Tuple[int, int]]]) -> List[List[Tuple[int, int]]]:
    """
    The data might include 2 annotated spans which are identical,
    but have different ids. This checks all clusters for spans which are
    identical, and if it finds any, merges the clusters containing the
    identical spans.
    """
    merged_clusters: List[Set[Tuple[int, int]]] = []
    for cluster in clusters:
        cluster_with_overlapping_mention = None
        for mention in cluster:
            # Look at clusters we have already processed to
            # see if they contain a mention in the current
            # cluster for comparison.
            for cluster2 in merged_clusters:
                if mention in cluster2:
                    # first cluster in merged clusters
                    # which contains this mention.
                    cluster_with_overlapping_mention = cluster2
                    break
            # Already encountered overlap - no need to keep looking.
            if cluster_with_overlapping_mention is not None:
                break
        if cluster_with_overlapping_mention is not None:
            # Merge cluster we are currently processing into
            # the cluster in the processed list.
            cluster_with_overlapping_mention.update(cluster)
        else:
            merged_clusters.append(set(cluster))
    return [list(c) for c in merged_clusters]

================================================
FILE: hanlp/datasets/srl/ontonotes5/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-26 16:07
# Base location of the OntoNotes 5.0 data inside the LDC2013T19 archive.
# NOTE(review): the part after '#' is presumably a path inside the archive resolved by get_resource - confirm.
ONTONOTES5_HOME = 'https://catalog.ldc.upenn.edu/LDC2013T19/LDC2013T19.tgz#/ontonotes-release-5.0/data/'
# Location of the generated CoNLL-2012 files: a ``conll-2012`` folder next to ``data``.
CONLL12_HOME = ONTONOTES5_HOME + '../conll-2012/'


================================================
FILE: hanlp/datasets/srl/ontonotes5/_utils.py
================================================
#!/usr/bin/env python
import codecs
import collections
import glob
import json
import os
import re
import sys
from pprint import pprint
from typing import List, Dict, Union

from hanlp_common.io import eprint, save_json

from hanlp.common.transform import NormalizeToken
from hanlp.datasets.parsing.loaders._ctb_utils import remove_all_ec, convert_to_dependency
from hanlp.datasets.parsing.ptb import PTB_TOKEN_MAPPING
from hanlp.utils.io_util import merge_files, get_resource, pushd, run_cmd, read_tsv_as_sents, replace_ext, \
    get_exitcode_stdout_stderr
from hanlp.utils.log_util import flash

# Matches a "#begin document (<doc id>); part <number>" header line; group 1 is the doc id, group 2 the part number.
BEGIN_DOCUMENT_REGEX = re.compile(r"#begin document \((.*)\); part (\d+)")


def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat


def get_doc_key(doc_id, part):
    """Build the ``<doc_id>_<part>`` key, with ``part`` normalised through ``int`` (so ``"000"`` becomes ``0``)."""
    part_number = int(part)
    return "{}_{}".format(doc_id, part_number)


class DocumentState(object):
    """Mutable accumulator for the annotations of one document.

    Per-token data is appended to ``text`` and the ``*_buffer`` lists while a sentence is being read,
    :meth:`finalize_sentence` moves those buffers into the per-sentence lists, and :meth:`finalize`
    returns the whole document as a plain ``dict``.
    """

    def __init__(self):
        self.doc_key = None
        # Tokens / speakers of the sentence currently being read.
        self.text = []
        self.text_speakers = []
        # Per-sentence results, one entry per finalized sentence.
        self.speakers = []
        self.sentences = []
        self.pos = []
        self.lemma = []
        self.pos_buffer = []
        self.lemma_buffer = []
        self.constituents = []  # {}
        self.const_stack = []
        self.const_buffer = []
        self.ner = []
        self.ner_stack = []
        self.ner_buffer = []
        self.srl = []
        self.argument_stacks = []
        self.argument_buffers = []
        self.predicate_buffer = []
        # cluster id -> list of (start, end) mentions / stack of open mention starts.
        self.clusters = collections.defaultdict(list)
        self.coref_stacks = collections.defaultdict(list)

    def assert_empty(self):
        """Assert that nothing has been accumulated yet."""
        assert self.doc_key is None
        assert len(self.text) == 0
        assert len(self.text_speakers) == 0
        assert len(self.speakers) == 0
        assert len(self.sentences) == 0
        assert len(self.srl) == 0
        assert len(self.predicate_buffer) == 0
        assert len(self.argument_buffers) == 0
        assert len(self.argument_stacks) == 0
        assert len(self.constituents) == 0
        assert len(self.const_stack) == 0
        assert len(self.const_buffer) == 0
        assert len(self.ner) == 0
        assert len(self.lemma_buffer) == 0
        assert len(self.pos_buffer) == 0
        assert len(self.ner_stack) == 0
        assert len(self.ner_buffer) == 0
        assert len(self.coref_stacks) == 0
        assert len(self.clusters) == 0

    def assert_finalizable(self):
        """Assert that a document was started, has sentences, and has no open sentence or span."""
        assert self.doc_key is not None
        assert len(self.text) == 0
        assert len(self.text_speakers) == 0
        assert len(self.speakers) > 0
        assert len(self.sentences) > 0
        assert len(self.constituents) > 0
        assert len(self.const_stack) == 0
        assert len(self.ner_stack) == 0
        assert len(self.predicate_buffer) == 0
        assert all(len(s) == 0 for s in self.coref_stacks.values())

    def finalize_sentence(self):
        """Move the buffers of the current sentence into the per-sentence lists and reset them."""
        self.sentences.append(tuple(self.text))
        del self.text[:]
        self.lemma.append(tuple(self.lemma_buffer))
        del self.lemma_buffer[:]
        self.pos.append(tuple(self.pos_buffer))
        del self.pos_buffer[:]
        self.speakers.append(tuple(self.text_speakers))
        del self.text_speakers[:]

        # The i-th predicate owns the i-th argument buffer.
        assert len(self.predicate_buffer) == len(self.argument_buffers)
        self.srl.append([])
        for pred, args in zip(self.predicate_buffer, self.argument_buffers):
            for start, end, label in args:
                self.srl[-1].append((pred, start, end, label))
        self.predicate_buffer = []
        self.argument_buffers = []
        self.argument_stacks = []
        self.constituents.append(list(self.const_buffer))
        self.const_buffer = []
        self.ner.append(list(self.ner_buffer))
        self.ner_buffer = []

    def finalize(self):
        """Merge clusters that share a mention and return the document as a ``dict``.

        The merge is transitive: a cluster id whose mentions overlap several already merged
        clusters joins all of them, so the uniqueness assertion below holds in that case too.

        Returns:
          A dict with the keys ``doc_key``, ``sentences``, ``lemma``, ``pos``, ``speakers``, ``srl``,
          ``constituents``, ``ner`` and ``clusters``.
        """
        merged_clusters = []
        for c1 in self.clusters.values():
            mentions = set(c1)
            existing = None
            kept = []
            for c2 in merged_clusters:
                if mentions.isdisjoint(c2):
                    kept.append(c2)
                elif existing is None:
                    existing = c2
                    kept.append(c2)
                else:
                    # c1 bridges two merged clusters: fold the later one into the first.
                    existing.update(c2)
            if existing is not None:
                print("Merging clusters (shouldn't happen very often.)")
                existing.update(mentions)
            else:
                kept.append(mentions)
            merged_clusters = kept
        merged_clusters = [list(c) for c in merged_clusters]
        all_mentions = [m for c in merged_clusters for m in c]
        assert len(all_mentions) == len(set(all_mentions))
        assert len(self.sentences) == len(self.srl)
        assert len(self.sentences) == len(self.constituents)
        assert len(self.sentences) == len(self.ner)
        return {
            "doc_key": self.doc_key,
            "sentences": self.sentences,
            "lemma": self.lemma,
            "pos": self.pos,
            "speakers": self.speakers,
            "srl": self.srl,
            "constituents": self.constituents,
            "ner": self.ner,
            "clusters": merged_clusters
        }


def filter_data(input_json_file, output_json_file, doc_ids_file=None, annotation=None):
    """Filter OntoNotes5 data based on CoNLL2012 (coref) doc ids.
    https://github.com/bcmi220/unisrl/blob/master/scripts/filter_conll2012_data.py

    A document is kept when its key prefix (the ``doc_key`` without its last ``_``-separated field)
    is listed in ``doc_ids_file`` (if given) and, if ``annotation`` is given, when the file
    ``<ontonotes_root>/data/files/data/<language>/annotations/<prefix><annotation>`` exists.
    Summary statistics of the kept documents are printed.

    Args:
      input_json_file: All documents, one JSON object per line. The language is taken from the second
        dot-separated field of its file name and the OntoNotes root is two folders above it.
      output_json_file: Where the kept documents are written. They follow the order of ``doc_ids_file``
        when it is given, otherwise the order of ``input_json_file``.
      doc_ids_file: Optional file with one id per line; the text after ``annotations/`` is the id.
      annotation: Optional annotation file suffix such as ``.name``.

    Returns:

    """
    assert doc_ids_file or annotation
    doc_count = sentence_count = word_count = 0
    srl_count = ner_count = cluster_count = missing_count = 0
    doc_ids = []
    doc_ids_to_keys = collections.defaultdict(list)
    filtered_examples = {}
    input_dir = os.path.dirname(input_json_file)
    ontonotes_root = os.path.abspath(os.path.join(input_dir, '..', '..'))
    language = os.path.basename(input_json_file).split('.')[1]

    if doc_ids_file:
        with open(doc_ids_file, "r") as id_lines:
            for line in id_lines:
                doc_id = line.strip().split("annotations/")[1]
                doc_ids.append(doc_id)
                doc_ids_to_keys[doc_id] = []

    with codecs.open(input_json_file, "r", "utf8") as src:
        for jsonline in src:
            example = json.loads(jsonline)
            doc_key = example["doc_key"]
            dk_prefix = "_".join(doc_key.split("_")[:-1])
            if doc_ids_file and dk_prefix not in doc_ids_to_keys:
                continue
            if annotation:
                annotation_file = os.path.join(ontonotes_root, 'data/files/data', language, 'annotations',
                                               dk_prefix) + annotation
                if not os.path.isfile(annotation_file):
                    print(annotation_file)
                    missing_count += 1
                    continue
            doc_ids_to_keys[dk_prefix].append(doc_key)
            filtered_examples[doc_key] = example

            sentences = example["sentences"]
            word_count += sum(len(s) for s in sentences)
            sentence_count += len(sentences)
            srl_count += sum(len(srl) for srl in example["srl"])
            ner_count += sum(len(ner) for ner in example["ner"])
            cluster_count += len(example["clusters"])
            doc_count += 1

    print(("Documents: {}\nSentences: {}\nWords: {}\nNER: {}, PAS: {}, Clusters: {}, No annotations: {}".format(
        doc_count, sentence_count, word_count, ner_count, srl_count, cluster_count, missing_count)))

    if doc_ids_file:
        # Arrange the files in order of id files
        selected = (filtered_examples[key] for doc_id in doc_ids for key in doc_ids_to_keys[doc_id])
    else:
        selected = filtered_examples.values()
    with codecs.open(output_json_file, "w", "utf8") as out:
        for doc in selected:
            out.write(json.dumps(doc, ensure_ascii=False))
            out.write("\n")


def normalize_word(word, language):
    """Normalise a raw token.

    For ``arabic`` only the text before the first ``#`` is kept; a word without ``#`` is kept whole
    (slicing with ``str.find`` would silently drop its last character, because ``find`` returns -1).
    The escaped punctuation tokens ``/.`` and ``/?`` lose their leading slash.

    Args:
      word: The raw token.
      language: Language name, e.g. ``english``, ``chinese`` or ``arabic``.

    Returns:
      The normalised token.
    """
    if language == "arabic":
        word = word.split("#", 1)[0]
    if word == "/." or word == "/?":
        return word[1:]
    return word


def handle_bit(word_index, bit, stack, spans, label_set):
    """Update open/closed labelled spans from one bracket cell such as ``(NP*``, ``*)`` or ``(V*)``.

    Every ``(LABEL`` before the ``*`` opens a span at ``word_index``; every ``)`` after it closes the
    most recently opened span. A cell without ``*`` is split into everything but its last character
    (the openings) and its last character (the closing).

    Args:
      word_index: Index of the current token.
      bit: The bracket cell of the current token.
      stack: Open spans as ``(start_index, label)``; mutated in place.
      spans: Closed spans as ``(start_index, end_index, label)``; mutated in place.
      label_set: Set collecting every label seen; mutated in place.
    """
    asterisk_idx = bit.find("*")
    if asterisk_idx >= 0:
        open_parens = bit[:asterisk_idx]
        close_parens = bit[asterisk_idx + 1:]
    else:
        open_parens = bit[:-1]
        close_parens = bit[-1]

    current_idx = open_parens.find("(")
    while current_idx >= 0:
        next_idx = open_parens.find("(", current_idx + 1)
        if next_idx >= 0:
            label = open_parens[current_idx + 1:next_idx]
        else:
            label = open_parens[current_idx + 1:]
        label_set.add(label)
        stack.append((word_index, label))
        current_idx = next_idx

    for c in close_parens:
        if c != ")":
            # Report and skip anything that is not a closing bracket. An explicit test is used
            # because an ``assert`` would be stripped under ``python -O`` and pop the stack wrongly.
            print(word_index, bit, spans, stack)
            continue
        open_index, label = stack.pop()
        spans.append((open_index, word_index, label))


def handle_line(line, document_state: DocumentState, language, labels, stats):
    """Consume one line of a gold CoNLL file and update ``document_state``.

    Args:
      line: The raw line.
      document_state: Accumulator of the document currently being read; mutated in place.
      language: Language name, used for word normalisation and for the sentence statistics keys.
      labels: Mapping from label kind (``categories``, ``ner``, ``srl``) to a set of seen labels.
      stats: Mapping from statistic name to a running number.

    Returns:
      The finalized document ``dict`` when ``line`` is an ``#end document`` line, otherwise ``None``.
    """
    begin_document_match = re.match(BEGIN_DOCUMENT_REGEX, line)
    if begin_document_match:
        # A new document may only start on a pristine state.
        document_state.assert_empty()
        document_state.doc_key = get_doc_key(begin_document_match.group(1), begin_document_match.group(2))
        return None
    elif line.startswith("#end document"):
        document_state.assert_finalizable()
        finalized_state = document_state.finalize()
        stats["num_clusters"] += len(finalized_state["clusters"])
        stats["num_mentions"] += sum(len(c) for c in finalized_state["clusters"])
        # labels["{}_const_labels".format(language)].update(l for _, _, l in finalized_state["constituents"])
        # labels["ner"].update(l for _, _, l in finalized_state["ner"])
        return finalized_state
    else:
        row = line.split()
        # Starting a new sentence.
        if len(row) == 0:
            stats["max_sent_len_{}".format(language)] = max(len(document_state.text),
                                                            stats["max_sent_len_{}".format(language)])
            stats["num_sents_{}".format(language)] += 1
            document_state.finalize_sentence()
            return None
        assert len(row) >= 12

        # Fixed columns of a token row; everything between the NER column and the last column
        # holds one argument column per predicate, and the last column is coreference.
        doc_key = get_doc_key(row[0], row[1])
        word = normalize_word(row[3], language)
        pos = row[4]
        parse = row[5]
        lemma = row[6]
        predicate_sense = row[7]
        speaker = row[9]
        ner = row[10]
        args = row[11:-1]
        coref = row[-1]

        # Token index counted from the start of the document, not of the sentence.
        word_index = len(document_state.text) + sum(len(s) for s in document_state.sentences)
        document_state.text.append(word)
        document_state.text_speakers.append(speaker)
        document_state.pos_buffer.append(pos)
        document_state.lemma_buffer.append(lemma)

        handle_bit(word_index, parse, document_state.const_stack, document_state.const_buffer, labels["categories"])
        handle_bit(word_index, ner, document_state.ner_stack, document_state.ner_buffer, labels["ner"])

        # The per-predicate stacks are emptied at each sentence end, so they are (re)created
        # on the first token row of a sentence that has argument columns.
        if len(document_state.argument_stacks) < len(args):
            document_state.argument_stacks = [[] for _ in args]
            document_state.argument_buffers = [[] for _ in args]

        for i, arg in enumerate(args):
            handle_bit(word_index, arg, document_state.argument_stacks[i], document_state.argument_buffers[i],
                       labels["srl"])
        if predicate_sense != "-":
            document_state.predicate_buffer.append(word_index)
        if coref != "-":
            # Segments are "(id)" for a one-token mention, "(id" to open a mention and "id)" to close one.
            for segment in coref.split("|"):
                if segment[0] == "(":
                    if segment[-1] == ")":
                        cluster_id = int(segment[1:-1])
                        document_state.clusters[cluster_id].append((word_index, word_index))
                    else:
                        cluster_id = int(segment[1:])
                        document_state.coref_stacks[cluster_id].append(word_index)
                else:
                    cluster_id = int(segment[:-1])
                    start = document_state.coref_stacks[cluster_id].pop()
                    document_state.clusters[cluster_id].append((start, word_index))
        return None


def ontonotes_document_generator(input_path, language, labels, stats):
    """Yield every finalized document of a gold CoNLL file.

    The file is decoded as UTF-8 regardless of the platform's locale and is read lazily, line by line.

    Args:
      input_path: Path to the gold CoNLL file.
      language: Language name forwarded to ``handle_line``.
      labels: Mapping from label kind to the set of labels seen; updated while reading.
      stats: Mapping from statistic name to a running number; updated while reading.

    Yields:
      One document ``dict`` per ``#end document`` line.
    """
    with open(input_path, "r", encoding="utf-8") as input_file:
        document_state = DocumentState()
        for line in input_file:
            document = handle_line(line, document_state, language, labels, stats)
            if document is not None:
                yield document
                # Start the next document from a pristine state.
                document_state = DocumentState()


def convert_to_jsonlines(input_path, output_path, language, labels=None, stats=None):
    """Convert a gold CoNLL file into a jsonlines file with one document per line.

    The output is written as UTF-8 because the documents are dumped with ``ensure_ascii=False`` and
    ``filter_data`` reads these files back as UTF-8.

    Args:
      input_path: Path to the gold CoNLL file.
      output_path: Path of the jsonlines file to write.
      language: Language name.
      labels: Optional mapping from label kind to a set of labels to update; created when ``None``.
      stats: Optional mapping from statistic name to a number to update; created when ``None``.

    Returns:
      The ``(labels, stats)`` pair.
    """
    if labels is None:
        labels = collections.defaultdict(set)
    if stats is None:
        stats = collections.defaultdict(int)
    with open(output_path, "w", encoding="utf-8") as output_file:
        for document in ontonotes_document_generator(input_path, language, labels, stats):
            output_file.write(json.dumps(document, ensure_ascii=False))
            output_file.write("\n")

    return labels, stats


def make_ontonotes_jsonlines(conll12_ontonotes_path, output_path, languages=None):
    """Build the jsonlines files of every requested language.

    Args:
      conll12_ontonotes_path: Location of the CoNLL-2012 formatted OntoNotes data.
      output_path: Folder receiving one sub-folder per language.
      languages: Languages to convert; english, chinese and arabic when ``None``.
    """
    selected = ['english', 'chinese', 'arabic'] if languages is None else languages
    for lang in selected:
        make_ontonotes_language_jsonlines(conll12_ontonotes_path, output_path, lang)


def make_ontonotes_language_jsonlines(conll12_ontonotes_path, output_path=None, language='english'):
    """Merge, convert and filter the gold CoNLL files of one language for the train/development/test splits.

    For each split the ``*gold_conll`` files are merged into one file, converted to jsonlines and
    then filtered by the official CoNLL-12 id list into ``<split>.<language>.conll12.jsonlines``.

    Args:
      conll12_ontonotes_path: Location of the CoNLL-2012 formatted data, resolved with ``get_resource``.
      output_path: Output folder; the parent folder of the resolved input path when ``None``.
      language: Language name.
    """
    conll12_ontonotes_path = get_resource(conll12_ontonotes_path)
    output_path = os.path.dirname(conll12_ontonotes_path) if output_path is None else output_path
    for split in ('train', 'development', 'test'):
        pattern = f'{conll12_ontonotes_path}/data/{split}/data/{language}/annotations/*/*/*/*gold_conll'
        files = glob.glob(pattern, recursive=True)
        files.sort()
        assert files, f'No gold_conll files found in {pattern}'
        # The version is read from the extension of the first file, e.g. "v4" in "*.v4_gold_conll".
        first_extension = os.path.basename(files[0]).split('.')[-1]
        version = first_extension.split('_')[0]
        if not version.startswith('v'):
            version = 'v5'
        else:
            assert all([version in os.path.basename(f) for f in files])
        lang_dir = f'{output_path}/{language}'
        if split == 'conll-2012-test':
            split = 'test'
        full_file = f'{lang_dir}/{split}.{language}.{version}_gold_conll'
        os.makedirs(lang_dir, exist_ok=True)
        print(f'Merging {len(files)} files to {full_file}')
        merge_files(files, full_file)
        v5_json_file = full_file.replace(f'.{version}_gold_conll', f'.{version}.jsonlines')
        print(f'Converting CoNLL file {full_file} to json file {v5_json_file}')
        labels, stats = convert_to_jsonlines(full_file, v5_json_file, language)
        print('Labels:')
        pprint(labels)
        print('Statistics:')
        pprint(stats)
        conll12_json_file = f'{lang_dir}/{split}.{language}.conll12.jsonlines'
        print(f'Applying CoNLL 12 official splits on {v5_json_file} to {conll12_json_file}')
        id_file = get_resource(f'https://file.hankcs.com/research/emnlp2021/conll.cemantix.org.zip#2012/download/ids/'
                               f'{language}/coref/{split}.id')
        filter_data(v5_json_file, conll12_json_file, id_file)


def ensure_python_points_to_python2():
    """Raise ``EnvironmentError`` unless the ``python`` command reports a ``Python 2`` version.

    The version text is the third value returned for ``python --version``, or the second one when
    the third is empty.
    """
    _, out, err = get_exitcode_stdout_stderr('python --version')
    version = err or out
    if version.startswith('Python 2'):
        return
    raise EnvironmentError(f'Your python command needs to be Python2, not {version.strip()}. Try:\n\n\t'
                           'ln -sf "$(which python2)" "$(which python)"')


def make_gold_conll(ontonotes_path, language):
    """Run the ``ontonotes_to_conll.sh`` conversion script for one language.

    Requires ``python`` to be Python 2 (checked first). The script is run with ``bash`` from inside
    the resolved OntoNotes folder.

    Args:
      ontonotes_path: Location of the OntoNotes release, resolved with ``get_resource``.
      language: Language to convert.

    Raises:
      RuntimeError: Re-raised from ``run_cmd`` after a failure message is shown.
    """
    ensure_python_points_to_python2()
    ontonotes_path = os.path.abspath(get_resource(ontonotes_path))
    to_conll = os.path.abspath(get_resource(
        'https://gist.githubusercontent.com/hankcs/46b9137016c769e4b6137104daf43a92/raw/66369de6c24b5ec47696ae307591f0d72c6f3f02/ontonotes_to_conll.sh'))
    # shutil.rmtree(os.path.join(ontonotes_path, 'conll-2012'), ignore_errors=True)
    with pushd(ontonotes_path):
        try:
            flash(f'Converting [blue]{language}[/blue] to CoNLL format, '
                  f'this might take half an hour [blink][yellow]...[/yellow][/blink]')
            run_cmd(f'bash {to_conll} {ontonotes_path} {language}')
            flash('')
        except RuntimeError:
            flash(f'[red]Failed[/red] to convert {language} of {ontonotes_path} to CoNLL. See exceptions for detail')
            raise


def convert_jsonlines_to_IOBES(json_file, output_file=None, doc_level_offset=True, normalize_token=False):
    """Write the NER spans of a jsonlines file as a ``token<TAB>tag`` file with IOBES tags.

    Sentences are separated by a blank line. Both files are handled as UTF-8 so that the input is
    decoded the same way on every platform.

    Args:
      json_file: Jsonlines file with ``sentences`` and ``ner`` per document, resolved with ``get_resource``.
      output_file: Output path; the input path with its extension replaced by ``.ner.tsv`` when empty.
      doc_level_offset: Whether span indices count from the start of the document, in which case the
        offset of each sentence is subtracted.
      normalize_token: Whether to map tokens through ``PTB_TOKEN_MAPPING`` before writing.
    """
    json_file = get_resource(json_file)
    if not output_file:
        output_file = os.path.splitext(json_file)[0] + '.ner.tsv'
    transform = NormalizeToken(PTB_TOKEN_MAPPING, 'token') if normalize_token else None
    with open(json_file, encoding='utf-8') as src, open(output_file, 'w', encoding='utf-8') as out:
        for line in src:
            doc = json.loads(line)
            offset = 0
            for sent, ner in zip(doc['sentences'], doc['ner']):
                if normalize_token:
                    sent = transform({'token': sent})['token']
                tags = ['O'] * len(sent)
                for start, end, label in ner:
                    if doc_level_offset:
                        start -= offset
                        end -= offset
                    if start == end:
                        tags[start] = 'S-' + label
                    else:
                        tags[start] = 'B-' + label
                        for i in range(start + 1, end):
                            tags[i] = 'I-' + label
                        tags[end] = 'E-' + label
                offset += len(sent)
                for token, tag in zip(sent, tags):
                    out.write(f'{token}\t{tag}\n')
                out.write('\n')


def make_ner_tsv_if_necessary(json_file):
    """Create the ``.ner.tsv`` sibling of a jsonlines file unless it already exists, and return its path."""
    json_file = get_resource(json_file)
    stem, _ = os.path.splitext(json_file)
    output_file = stem + '.ner.tsv'
    if os.path.isfile(output_file):
        return output_file
    convert_jsonlines_to_IOBES(json_file, output_file)
    return output_file


def batch_make_ner_tsv_if_necessary(json_files):
    """Apply ``make_ner_tsv_if_necessary`` to every file in ``json_files``."""
    for json_file in json_files:
        make_ner_tsv_if_necessary(json_file)


def make_pos_tsv_if_necessary(json_file):
    """Create the ``.pos.tsv`` sibling of a jsonlines file unless it already exists, and return its path."""
    json_file = get_resource(json_file)
    stem, _ = os.path.splitext(json_file)
    output_file = stem + '.pos.tsv'
    if os.path.isfile(output_file):
        return output_file
    make_pos_tsv(json_file, output_file)
    return output_file


def make_pos_tsv(json_file, output_file):
    """Write the tokens and POS tags of a jsonlines file as ``token<TAB>tag`` lines.

    Sentences are separated by a blank line. Both files are handled as UTF-8 so that the input is
    decoded the same way on every platform.

    Args:
      json_file: Jsonlines file with ``sentences`` and ``pos`` per document.
      output_file: Path of the TSV file to write.
    """
    with open(json_file, encoding='utf-8') as src, open(output_file, 'w', encoding='utf-8') as out:
        for line in src:
            doc = json.loads(line)
            for sent, pos in zip(doc['sentences'], doc['pos']):
                for token, tag in zip(sent, pos):
                    out.write(f'{token}\t{tag}\n')
                out.write('\n')


def batch_make_pos_tsv_if_necessary(json_files):
    """Apply ``make_pos_tsv_if_necessary`` to every file in ``json_files``."""
    for json_file in json_files:
        make_pos_tsv_if_necessary(json_file)


def make_con_txt(conll_file, output_file):
    """Write one bracketed constituency tree per sentence of a CoNLL file.

    The ``*`` of every parse cell is replaced by ``(POS word)`` and the cells of a sentence are
    joined with spaces. The output is written as UTF-8, like the other writers in this module.

    Args:
      conll_file: CoNLL file read with ``read_tsv_as_sents``.
      output_file: Path of the text file to write.

    Raises:
      ValueError: If a token row has fewer than 8 cells (the row is printed first).
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        for sent in read_tsv_as_sents(conll_file):
            tree = []
            for cell in sent:
                # Skip the "#begin document" / "#end document" marker rows.
                if cell[0] == '#begin' or cell[0] == '#end':
                    continue
                if len(cell) < 8:
                    print(cell)
                # Only word, POS and parse are used; the unpacking also enforces the minimum width.
                _, _, _, word, POS, parse, _, _, *_ = cell
                tree.append(parse.replace('*', f'({POS} {word})'))
            out.write(' '.join(tree))
            out.write('\n')


def make_con_txt_if_necessary(json_file):
    """Create the ``.con.txt`` sibling of the given file unless it already exists, and return its path.

    The resolved input is handed to ``make_con_txt`` as its CoNLL file.
    """
    json_file = get_resource(json_file)
    stem, _ = os.path.splitext(json_file)
    output_file = stem + '.con.txt'
    if os.path.isfile(output_file):
        return output_file
    make_con_txt(json_file, output_file)
    return output_file


def batch_make_con_txt_if_necessary(json_files):
    """Apply ``make_con_txt_if_necessary`` to every file in ``json_files``."""
    for json_file in json_files:
        make_con_txt_if_necessary(json_file)


def batch_remove_empty_category_if_necessary(json_files):
    """Call ``remove_all_ec`` on every resolved file whose ``.noempty.txt`` counterpart does not exist yet."""
    for each in json_files:
        src = get_resource(each)
        if os.path.isfile(replace_ext(src, '.noempty.txt')):
            continue
        remove_all_ec(src)


def make_dep_conllx(con_txt_file, output_file, language='en'):
    """Resolve ``con_txt_file`` and convert it with ``convert_to_dependency`` into ``output_file``."""
    resolved = get_resource(con_txt_file)
    convert_to_dependency(resolved, output_file, language=language)


def make_dep_conllx_if_necessary(con_txt_file: str, language='en'):
    """Create the ``.dep.conllx`` counterpart of a ``.con.txt`` file unless it already exists."""
    con_txt_file = get_resource(con_txt_file)
    output_file = con_txt_file.replace('.con.txt', '.dep.conllx', 1)
    if not os.path.isfile(output_file):
        make_dep_conllx(con_txt_file, output_file, language)


def batch_make_dep_conllx_if_necessary(con_txt_files, language='en'):
    """Apply ``make_dep_conllx_if_necessary`` to every file in ``con_txt_files``."""
    for con_txt_file in con_txt_files:
        make_dep_conllx_if_necessary(con_txt_file, language)


def make_ner_json_if_necessary(json_file):
    """Create the ``.ner.jsonlines`` sibling of a jsonlines file unless it already exists, and return its path."""
    json_file = get_resource(json_file)
    stem, _ = os.path.splitext(json_file)
    output_file = stem + '.ner.jsonlines'
    if os.path.isfile(output_file):
        return output_file
    make_ner_json(json_file, output_file)
    return output_file


def batch_make_ner_json_if_necessary(json_files):
    """Apply ``make_ner_json_if_necessary`` to every file in ``json_files``."""
    for json_file in json_files:
        make_ner_json_if_necessary(json_file)


def make_ner_json(json_file, output_file):
    """Keep only the documents that have a ``.name`` annotation file (see ``filter_data``)."""
    filter_data(json_file, output_file, annotation='.name')


def make_srl_json_if_necessary(json_file):
    """Create the ``.srl.jsonlines`` sibling of a jsonlines file unless it already exists, and return its path."""
    json_file = get_resource(json_file)
    stem, _ = os.path.splitext(json_file)
    output_file = stem + '.srl.jsonlines'
    if os.path.isfile(output_file):
        return output_file
    make_srl_json(json_file, output_file)
    return output_file


def make_coref_json_if_necessary(json_file):
    """Create the ``.coref.jsonlines`` sibling of a jsonlines file unless it already exists, and return its path."""
    json_file = get_resource(json_file)
    stem, _ = os.path.splitext(json_file)
    output_file = stem + '.coref.jsonlines'
    if os.path.isfile(output_file):
        return output_file
    make_coref_json(json_file, output_file)
    return output_file


def batch_make_srl_json_if_necessary(json_files):
    """Apply ``make_srl_json_if_necessary`` to every file in ``json_files``."""
    for json_file in json_files:
        make_srl_json_if_necessary(json_file)


def make_srl_json(json_file, output_file):
    """Keep only the documents that have a ``.prop`` annotation file (see ``filter_data``)."""
    filter_data(json_file, output_file, annotation='.prop')


def batch_make_coref_json_if_necessary(json_files):
    """Apply ``make_coref_json_if_necessary`` to every file in ``json_files``."""
    for json_file in json_files:
        make_coref_json_if_necessary(json_file)


def make_coref_json(json_file, output_file):
    """Keep only the documents that have a ``.coref`` annotation file (see ``filter_data``)."""
    filter_data(json_file, output_file, annotation='.coref')


def load_raw_text(onf_file) -> List[str]:
    """Extract the plain sentences of an ``.onf`` file.

    A sentence starts two lines after a ``Plain sentence:`` line (the line in between is skipped)
    and runs until the next blank line; its stripped lines are joined with single spaces. A sentence
    that is still open at the end of the file is returned as well instead of being dropped.
    The file is decoded as UTF-8.

    Args:
      onf_file: Path to the ``.onf`` file.

    Returns:
      The sentences in file order.
    """
    sents = []
    sent_parts = []
    expect_sent_line = False  # the next line is the one to skip after the header
    expect_sent = False  # currently inside the sentence text
    with open(onf_file, encoding='utf-8') as src:
        for line in src:
            line = line.strip()
            if line == 'Plain sentence:':
                expect_sent_line = True
            elif expect_sent_line:
                expect_sent_line = False
                expect_sent = True
            elif expect_sent:
                if line:
                    sent_parts.append(line)
                else:
                    sents.append(' '.join(sent_parts))
                    expect_sent = False
                    sent_parts = []
    if expect_sent and sent_parts:
        # No blank line terminated the last sentence.
        sents.append(' '.join(sent_parts))
    return sents


def batch_load_raw_text(root: str) -> Dict[str, List[str]]:
    """Load the plain sentences of every ``.onf`` file found recursively under ``root``.

    Returns:
      A dict keyed by the part of each path after ``annotations/``, without the ``.onf`` suffix.
    """
    pattern = os.path.join(root, '**/*.onf')
    sents = {}
    for path in sorted(glob.glob(pattern, recursive=True)):
        relative = path.split('annotations/')[1]
        sents[relative[:-len('.onf')]] = load_raw_text(path)
    return sents


def make_raw_text_if_necessary(home: str):
    """Save the plain sentences of all ``.onf`` files under ``home`` to ``text.jsonlines`` unless it exists."""
    home = get_resource(home)
    jsonpath = os.path.join(home, 'text.jsonlines')
    if not os.path.isfile(jsonpath):
        save_json(batch_load_raw_text(home), jsonpath)


class RestoreToken(NormalizeToken):
    """A ``NormalizeToken`` applied to nested token lists, defaulting to un-escaping ``/-`` and ``/.``."""

    def __init__(self, src: str, mapper: Union[str, dict] = None, dst: str = None) -> None:
        """
        Args:
          src: Key of the field to read.
          mapper: Token mapping; ``/-`` -> ``-`` and ``/.`` -> ``.`` when empty.
          dst: Key of the field to write, forwarded to ``NormalizeToken``.
        """
        default_mapper = {
            '/-': '-',
            '/.': '.',
        }
        super().__init__(mapper if mapper else default_mapper, src, dst)

    def __call__(self, sample: dict) -> dict:
        """Convert every token of every inner list of ``sample[self.src]`` and store the result in ``sample[self.dst]``."""
        nested_tokens = sample[self.src]
        sample[self.dst] = [[self.convert(token) for token in tokens] for tokens in nested_tokens]
        return sample


def main():
    """Command line entry point: ``<script> ontonotes_path output_path``.

    Exits with status 1 when the number of arguments is wrong. ``sys.exit`` is used because the bare
    ``exit`` helper is injected by the ``site`` module and is not available in every environment.
    """
    if len(sys.argv) != 3:
        eprint('2 arguments required: ontonotes_path output_path')
        sys.exit(1)
    ontonotes_path = sys.argv[1]
    output_path = sys.argv[2]
    make_ontonotes_jsonlines(ontonotes_path, output_path)


# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: hanlp/datasets/srl/ontonotes5/chinese.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-26 16:07
import os
from urllib.error import HTTPError
import shutil

from hanlp.datasets.srl.ontonotes5 import ONTONOTES5_HOME, CONLL12_HOME
from hanlp.datasets.srl.ontonotes5._utils import make_gold_conll, make_ontonotes_language_jsonlines, \
    batch_make_ner_tsv_if_necessary, batch_make_pos_tsv_if_necessary, batch_make_con_txt_if_necessary, \
    batch_make_dep_conllx_if_necessary
from hanlp.utils.io_util import get_resource, path_from_url
from hanlp.utils.log_util import cprint, flash

# All paths below are plain string concatenations onto the shared OntoNotes5 / CoNLL-12 locations.
_ONTONOTES5_CHINESE_HOME = ONTONOTES5_HOME + 'files/data/chinese/'
_ONTONOTES5_CONLL12_CHINESE_HOME = CONLL12_HOME + 'chinese/'
# Documents of the official CoNLL-12 splits, as jsonlines.
ONTONOTES5_CONLL12_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.conll12.jsonlines'
'''Training set of OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.conll12.jsonlines'
'''Dev set of OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.conll12.jsonlines'
'''Test set of OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''

# NER TSV files derived from the CoNLL-12 jsonlines above.
ONTONOTES5_CONLL12_NER_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.conll12.ner.tsv'
'''Training set of OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_NER_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.conll12.ner.tsv'
'''Dev set of OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_NER_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.conll12.ner.tsv'
'''Test set of OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''

# Unfiltered v4 documents, as jsonlines.
ONTONOTES5_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.v4.jsonlines'
ONTONOTES5_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.v4.jsonlines'
ONTONOTES5_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.v4.jsonlines'

# Merged v4 gold CoNLL files.
ONTONOTES5_CONLL_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.v4_gold_conll'
ONTONOTES5_CONLL_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.v4_gold_conll'
ONTONOTES5_CONLL_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.v4_gold_conll'

# POS TSV files derived from the v4 jsonlines.
ONTONOTES5_POS_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.v4.pos.tsv'
ONTONOTES5_POS_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.v4.pos.tsv'
ONTONOTES5_POS_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.v4.pos.tsv'

# Constituency tree text files (one '.con.txt' per split).
ONTONOTES5_CON_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.con.txt'
ONTONOTES5_CON_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.con.txt'
ONTONOTES5_CON_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.con.txt'

# Dependency files in '.dep.conllx' form, named after the '.con.txt' files.
ONTONOTES5_DEP_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.dep.conllx'
ONTONOTES5_DEP_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.dep.conllx'
ONTONOTES5_DEP_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.dep.conllx'

# ONTONOTES5_CON_CHINESE_NOEC_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.con.noempty.txt'
# ONTONOTES5_CON_CHINESE_NOEC_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.con.noempty.txt'
# ONTONOTES5_CON_CHINESE_NOEC_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.con.noempty.txt'


# NER TSV files derived from the v4 jsonlines.
ONTONOTES5_NER_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.v4.ner.tsv'
ONTONOTES5_NER_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.v4.ner.tsv'
ONTONOTES5_NER_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.v4.ner.tsv'

# Make sure the OntoNotes 5.0 data is available. When it cannot be fetched (it is licensed by LDC),
# fall back to an unofficial Chinese copy and place it where the official release is expected.
try:
    get_resource(ONTONOTES5_HOME, verbose=False)
except HTTPError:
    intended_file_path = path_from_url(ONTONOTES5_HOME)
    # The catalog link matches the release referenced by ONTONOTES5_HOME (LDC2013T19).
    cprint('Ontonotes 5.0 is a [red][bold]copyright[/bold][/red] dataset owned by LDC which we cannot re-distribute. '
           f'Please apply for a licence from LDC (https://catalog.ldc.upenn.edu/LDC2013T19) '
           f'then download it to {intended_file_path}')
    cprint('Luckily, an [red]unofficial[/red] Chinese version is provided on GitHub '
           'which will be used for demonstration purpose.')
    unofficial_chinese = get_resource('https://github.com/GuocaiL/Coref_Resolution/archive/master.zip#data/')
    intended_home, _ = os.path.splitext(intended_file_path)
    intended_home = os.path.join(os.path.dirname(intended_home), 'ontonotes-release-5.0')
    intended_chinese = f'{intended_home}/data/files/data/chinese/'
    for folder in ['annotations', 'metadata']:
        flash(f'Copying {unofficial_chinese}{folder} to {intended_chinese}{folder} [blink][yellow]...[/yellow][/blink]')
        shutil.copytree(f'{unofficial_chinese}{folder}', f'{intended_chinese}{folder}')
    flash('')

# Generate the CoNLL-12 files locally when the training jsonlines cannot be fetched:
# first the gold CoNLL files, then the per-split jsonlines for Chinese.
try:
    get_resource(ONTONOTES5_CONLL12_CHINESE_TRAIN, verbose=False)
except HTTPError:
    make_gold_conll(ONTONOTES5_HOME + '..', 'chinese')
    make_ontonotes_language_jsonlines(CONLL12_HOME + 'v4', language='chinese')

# Derive the task specific files; every helper skips files that already exist.
# NER tags of the official CoNLL-12 splits (this call used to be repeated verbatim).
batch_make_ner_tsv_if_necessary(
    [ONTONOTES5_CONLL12_CHINESE_TRAIN, ONTONOTES5_CONLL12_CHINESE_DEV, ONTONOTES5_CONLL12_CHINESE_TEST])

# NER tags of the unfiltered v4 documents.
batch_make_ner_tsv_if_necessary(
    [ONTONOTES5_CHINESE_TRAIN, ONTONOTES5_CHINESE_DEV, ONTONOTES5_CHINESE_TEST])

# POS tags of the unfiltered v4 documents.
batch_make_pos_tsv_if_necessary(
    [ONTONOTES5_CHINESE_TRAIN, ONTONOTES5_CHINESE_DEV, ONTONOTES5_CHINESE_TEST])

# Constituency trees from the merged gold CoNLL files.
batch_make_con_txt_if_necessary(
    [ONTONOTES5_CONLL_CHINESE_TRAIN, ONTONOTES5_CONLL_CHINESE_DEV, ONTONOTES5_CONLL_CHINESE_TEST])

# Dependency trees from the constituency trees.
batch_make_dep_conllx_if_necessary(
    [ONTONOTES5_CON_CHINESE_TRAIN, ONTONOTES5_CON_CHINESE_DEV, ONTONOTES5_CON_CHINESE_TEST], language='zh')

# batch_remove_empty_category_if_necessary(
#     [ONTONOTES5_CON_CHINESE_TRAIN, ONTONOTES5_CON_CHINESE_DEV, ONTONOTES5_CON_CHINESE_TEST])


================================================
FILE: hanlp/datasets/srl/ontonotes5/english.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-25 18:48


from urllib.error import HTTPError

from hanlp.datasets.srl.ontonotes5 import ONTONOTES5_HOME, CONLL12_HOME
from hanlp.datasets.srl.ontonotes5._utils import make_gold_conll, make_ontonotes_language_jsonlines, \
    batch_make_ner_tsv_if_necessary, batch_make_pos_tsv_if_necessary, batch_make_con_txt_if_necessary, \
    batch_make_dep_conllx_if_necessary
from hanlp.utils.io_util import get_resource, path_from_url
from hanlp.utils.log_util import cprint

# Base locations for the English portion, derived from the package-level OntoNotes5 / CoNLL12 homes.
_ONTONOTES5_ENGLISH_HOME = ONTONOTES5_HOME + 'files/data/english/'
_ONTONOTES5_CONLL12_ENGLISH_HOME = CONLL12_HOME + 'english/'

# CoNLL12 splits in jsonlines format.
ONTONOTES5_CONLL12_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.conll12.jsonlines'
'''Training set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.conll12.jsonlines'
'''Dev set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.conll12.jsonlines'
'''Test set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''

# "v4" splits in jsonlines format.
ONTONOTES5_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.v4.jsonlines'
ONTONOTES5_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.v4.jsonlines'
ONTONOTES5_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.v4.jsonlines'

# "v4" splits as gold CoNLL files.
ONTONOTES5_CONLL_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.v4_gold_conll'
ONTONOTES5_CONLL_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.v4_gold_conll'
ONTONOTES5_CONLL_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.v4_gold_conll'

# Part-of-speech tagging splits (tsv).
ONTONOTES5_POS_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.v4.pos.tsv'
ONTONOTES5_POS_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.v4.pos.tsv'
ONTONOTES5_POS_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.v4.pos.tsv'

# Constituency splits (txt).
ONTONOTES5_CON_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.con.txt'
ONTONOTES5_CON_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.con.txt'
ONTONOTES5_CON_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.con.txt'

# Dependency splits (conllx).
ONTONOTES5_DEP_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.dep.conllx'
ONTONOTES5_DEP_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.dep.conllx'
ONTONOTES5_DEP_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.dep.conllx'

# ONTONOTES5_CON_ENGLISH_NOEC_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.con.noempty.txt'
# ONTONOTES5_CON_ENGLISH_NOEC_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.con.noempty.txt'
# ONTONOTES5_CON_ENGLISH_NOEC_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.con.noempty.txt'

# Named entity recognition splits (tsv) for the CoNLL12 files.
ONTONOTES5_CONLL12_NER_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.conll12.ner.tsv'
'''Training set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_NER_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.conll12.ner.tsv'
'''Dev set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_NER_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.conll12.ner.tsv'
'''Test set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''

# Named entity recognition splits (tsv) for the "v4" files.
ONTONOTES5_NER_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.v4.ner.tsv'
ONTONOTES5_NER_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.v4.ner.tsv'
ONTONOTES5_NER_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.v4.ner.tsv'

# OntoNotes 5.0 cannot be re-distributed, so if it cannot be fetched the user is told where to put
# a licensed copy and the process terminates with exit status 1.
try:
    get_resource(ONTONOTES5_HOME, verbose=False)
except HTTPError:
    intended_file_path = path_from_url(ONTONOTES5_HOME)
    cprint('Ontonotes 5.0 is a [red][bold]copyright[/bold][/red] dataset owned by LDC which we cannot re-distribute. '
           f'Please apply for a licence from LDC (https://catalog.ldc.upenn.edu/LDC2016T13) '
           f'then download it to {intended_file_path}')
    # ``exit`` is injected by the ``site`` module for interactive use and may be missing
    # (e.g. ``python -S`` or frozen interpreters); raising SystemExit is always available.
    raise SystemExit(1)

# If the CoNLL12 English jsonlines cannot be fetched, build the gold CoNLL files and the
# per-language jsonlines locally from the OntoNotes release.
try:
    get_resource(ONTONOTES5_CONLL12_ENGLISH_TRAIN, verbose=False)
except HTTPError:
    make_gold_conll(ONTONOTES5_HOME + '..', 'english')
    make_ontonotes_language_jsonlines(CONLL12_HOME + 'v4', language='english')

# Derive the task-specific files from the jsonlines / CoNLL files.
batch_make_ner_tsv_if_necessary(
    [ONTONOTES5_CONLL12_ENGLISH_TRAIN, ONTONOTES5_CONLL12_ENGLISH_DEV, ONTONOTES5_CONLL12_ENGLISH_TEST])

batch_make_ner_tsv_if_necessary(
    [ONTONOTES5_ENGLISH_TRAIN, ONTONOTES5_ENGLISH_DEV, ONTONOTES5_ENGLISH_TEST])

batch_make_pos_tsv_if_necessary(
    [ONTONOTES5_ENGLISH_TRAIN, ONTONOTES5_ENGLISH_DEV, ONTONOTES5_ENGLISH_TEST])

batch_make_con_txt_if_necessary(
    [ONTONOTES5_CONLL_ENGLISH_TRAIN, ONTONOTES5_CONLL_ENGLISH_DEV, ONTONOTES5_CONLL_ENGLISH_TEST])

batch_make_dep_conllx_if_necessary(
    [ONTONOTES5_CON_ENGLISH_TRAIN, ONTONOTES5_CON_ENGLISH_DEV, ONTONOTES5_CON_ENGLISH_TEST])

# batch_remove_empty_category_if_necessary(
#     [ONTONOTES5_CON_ENGLISH_TRAIN, ONTONOTES5_CON_ENGLISH_DEV, ONTONOTES5_CON_ENGLISH_TEST])


================================================
FILE: hanlp/datasets/sts/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-20 16:25


================================================
FILE: hanlp/datasets/sts/stsb.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-20 16:25
from typing import Union, List, Callable

from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import read_cells

# The three splits live in one archive; the fragment after '#' selects the csv file
# (presumably resolved by the resource loader -- verify against get_resource).
_STS_B_ARCHIVE = 'http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz'
STS_B_TRAIN = _STS_B_ARCHIVE + '#sts-train.csv'
STS_B_DEV = _STS_B_ARCHIVE + '#sts-dev.csv'
STS_B_TEST = _STS_B_ARCHIVE + '#sts-test.csv'


class SemanticTextualSimilarityDataset(TransformableDataset):
    """Dataset of sentence pairs with a similarity score, read from a delimited text file."""

    def __init__(self,
                 data: Union[str, List],
                 sent_a_col,
                 sent_b_col,
                 similarity_col,
                 delimiter='auto',
                 transform: Union[Callable, List] = None,
                 cache=None,
                 generate_idx=None) -> None:
        """
        Args:
            data: A path or a list, forwarded to :class:`TransformableDataset`.
            sent_a_col: Index of the cell holding the first sentence.
            sent_b_col: Index of the cell holding the second sentence.
            similarity_col: Index of the cell holding the similarity score.
            delimiter: Cell delimiter handed to ``read_cells``.
            transform: Forwarded to :class:`TransformableDataset`.
            cache: Forwarded to :class:`TransformableDataset`.
            generate_idx: Forwarded to :class:`TransformableDataset`.
        """
        # Column settings must exist before the base constructor runs, which may already
        # trigger ``load_file`` (NOTE(review): confirm against TransformableDataset).
        self.sent_a_col = sent_a_col
        self.sent_b_col = sent_b_col
        self.similarity_col = similarity_col
        self.delimiter = delimiter
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath: str):
        """Yield one ``{'sent_a', 'sent_b', 'similarity'}`` dict per row of ``filepath``.

        Args:
            filepath: The file to read with ``read_cells``.
        """
        rows = read_cells(filepath, strip=True, delimiter=self.delimiter)
        for row in rows:
            sample = dict()
            sample['sent_a'] = row[self.sent_a_col]
            sample['sent_b'] = row[self.sent_b_col]
            # The score is stored as text in the file and converted to a float here.
            sample['similarity'] = float(row[self.similarity_col])
            yield sample


================================================
FILE: hanlp/datasets/tokenization/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-01 12:33

================================================
FILE: hanlp/datasets/tokenization/ctb6.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:19

_CTB6_CWS_HOME = 'http://file.hankcs.com/corpus/ctb6_cws.zip'

CTB6_CWS_TRAIN = f'{_CTB6_CWS_HOME}#train.txt'
'''CTB6 training set.'''
CTB6_CWS_DEV = f'{_CTB6_CWS_HOME}#dev.txt'
'''CTB6 dev set.'''
CTB6_CWS_TEST = f'{_CTB6_CWS_HOME}#test.txt'
'''CTB6 test set.'''


================================================
FILE: hanlp/datasets/tokenization/loaders/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:06


================================================
FILE: hanlp/datasets/tokenization/loaders/chunking_dataset.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-03 18:50
from typing import Union, List, Callable

from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import get_resource
from hanlp.utils.span_util import bmes_of
from hanlp.utils.string_util import ispunct


class ChunkingDataset(TransformableDataset):
    """Dataset yielding ``{'char': ..., 'tag': ...}`` samples produced by ``bmes_of`` for each line of a file."""

    def __init__(self, data: Union[str, List], transform: Union[Callable, List] = None, cache=None,
                 generate_idx=None, max_seq_len=None, sent_delimiter=None) -> None:
        """
        Args:
            data: A path or a list, forwarded to :class:`TransformableDataset`.
            transform: Forwarded to :class:`TransformableDataset`.
            cache: Forwarded to :class:`TransformableDataset`.
            generate_idx: Forwarded to :class:`TransformableDataset`.
            max_seq_len: Lines with more chars than this are cut into shorter pieces at delimiters.
            sent_delimiter: Where a long line may be cut. Either a callable ``char -> bool``, a string whose
                characters are the delimiters, or a falsy value to fall back to ``ispunct``.
        """
        if not sent_delimiter:
            sent_delimiter = ispunct
        elif isinstance(sent_delimiter, str):
            # Keep the character set under its own name: the predicate must not close over
            # ``sent_delimiter``, which is rebound to the predicate itself on the next line.
            delimiter_chars = set(sent_delimiter)
            sent_delimiter = lambda x: x in delimiter_chars
        self.sent_delimiter = sent_delimiter
        self.max_seq_len = max_seq_len
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath):
        """Yield one sample per (possibly shortened) line of ``filepath``.

        Args:
            filepath: Path or URL of the corpus; resolved with ``get_resource``.
        """
        max_seq_len = self.max_seq_len
        delimiter = self.sent_delimiter
        for chars, tags in self._generate_chars_tags(filepath, delimiter, max_seq_len):
            yield {'char': chars, 'tag': tags}

    @staticmethod
    def _generate_chars_tags(filepath, delimiter, max_seq_len):
        """Yield ``(chars, tags)`` pairs, cutting over-long lines at delimiter characters.

        Args:
            filepath: Path or URL of the corpus; resolved with ``get_resource``.
            delimiter: Callable ``char -> bool`` marking positions where a cut is allowed.
            max_seq_len: Length threshold; falsy disables cutting.
        """
        filepath = get_resource(filepath)
        with open(filepath, encoding='utf8') as src:
            for text in src:
                chars, tags = bmes_of(text, True)
                if max_seq_len and delimiter and len(chars) > max_seq_len:
                    short_chars, short_tags = [], []
                    for char, tag in zip(chars, tags):
                        short_chars.append(char)
                        short_tags.append(tag)
                        # Only cut once the piece is long enough AND we are on a delimiter, so a
                        # piece may exceed max_seq_len when no delimiter shows up in time.
                        if len(short_chars) >= max_seq_len and delimiter(char):
                            yield short_chars, short_tags
                            short_chars, short_tags = [], []
                    if short_chars:
                        yield short_chars, short_tags
                else:
                    yield chars, tags


================================================
FILE: hanlp/datasets/tokenization/loaders/multi_criteria_cws/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-11 20:35

# Every corpus below is a sub-folder of this location, each with the same four files.
_HOME = 'https://github.com/hankcs/multi-criteria-cws/archive/naive-mix.zip#data/raw/'

CNC_TRAIN_ALL = f'{_HOME}cnc/train-all.txt'
CNC_TRAIN = f'{_HOME}cnc/train.txt'
CNC_DEV = f'{_HOME}cnc/dev.txt'
CNC_TEST = f'{_HOME}cnc/test.txt'

CTB_TRAIN_ALL = f'{_HOME}ctb/train-all.txt'
CTB_TRAIN = f'{_HOME}ctb/train.txt'
CTB_DEV = f'{_HOME}ctb/dev.txt'
CTB_TEST = f'{_HOME}ctb/test.txt'

SXU_TRAIN_ALL = f'{_HOME}sxu/train-all.txt'
SXU_TRAIN = f'{_HOME}sxu/train.txt'
SXU_DEV = f'{_HOME}sxu/dev.txt'
SXU_TEST = f'{_HOME}sxu/test.txt'

UDC_TRAIN_ALL = f'{_HOME}udc/train-all.txt'
UDC_TRAIN = f'{_HOME}udc/train.txt'
UDC_DEV = f'{_HOME}udc/dev.txt'
UDC_TEST = f'{_HOME}udc/test.txt'

WTB_TRAIN_ALL = f'{_HOME}wtb/train-all.txt'
WTB_TRAIN = f'{_HOME}wtb/train.txt'
WTB_DEV = f'{_HOME}wtb/dev.txt'
WTB_TEST = f'{_HOME}wtb/test.txt'

ZX_TRAIN_ALL = f'{_HOME}zx/train-all.txt'
ZX_TRAIN = f'{_HOME}zx/train.txt'
ZX_DEV = f'{_HOME}zx/dev.txt'
ZX_TEST = f'{_HOME}zx/test.txt'


================================================
FILE: hanlp/datasets/tokenization/loaders/multi_criteria_cws/mcws_dataset.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-21 19:11
import os
from typing import Union, List, Callable, Dict, Iterable

from hanlp.datasets.tokenization.loaders.txt import TextTokenizingDataset
from hanlp.utils.io_util import get_resource


class MultiCriteriaTextTokenizingDataset(TextTokenizingDataset):
    """A tokenization dataset that merges several corpora and labels every sample with its criterion."""

    def __init__(self,
                 data: Union[str, List],
                 transform: Union[Callable, List] = None,
                 cache=None,
                 generate_idx=None,
                 delimiter=None,
                 max_seq_len=None,
                 sent_delimiter=None,
                 char_level=False,
                 hard_constraint=False) -> None:
        """Accepts exactly the arguments of :class:`TextTokenizingDataset` and forwards them unchanged."""
        super().__init__(data=data,
                         transform=transform,
                         cache=cache,
                         generate_idx=generate_idx,
                         delimiter=delimiter,
                         max_seq_len=max_seq_len,
                         sent_delimiter=sent_delimiter,
                         char_level=char_level,
                         hard_constraint=hard_constraint)

    def should_load_file(self, data) -> bool:
        """Return ``True`` when ``data`` is a tuple or a dict, i.e. a collection of corpora to load."""
        return isinstance(data, (tuple, dict))

    def load_file(self, filepath: Union[Iterable[str], Dict[str, str]]):
        """Load multi-criteria corpora specified in filepath.

        Every yielded sample is the sample produced by :meth:`TextTokenizingDataset.load_file` with an extra
        ``'criteria'`` field.

        Args:
            filepath: Either a dict mapping each criterion to the path of its corpus, or an iterable whose
                elements are ``(criterion, path)`` tuples or plain paths. For a plain path the criterion is
                the name of the folder that contains the resolved file.

        .. highlight:: bash
        .. code-block:: bash

            $ tree -L 2 .
            .
            ├── cnc
            │   ├── dev.txt
            │   ├── test.txt
            │   ├── train-all.txt
            │   └── train.txt
            ├── ctb
            │   ├── dev.txt
            │   ├── test.txt
            │   ├── train-all.txt
            │   └── train.txt
            ├── sxu
            │   ├── dev.txt
            │   ├── test.txt
            │   ├── train-all.txt
            │   └── train.txt
            ├── udc
            │   ├── dev.txt
            │   ├── test.txt
            │   ├── train-all.txt
            │   └── train.txt
            ├── wtb
            │   ├── dev.txt
            │   ├── test.txt
            │   ├── train-all.txt
            │   └── train.txt
            └── zx
                ├── dev.txt
                ├── test.txt
                ├── train-all.txt
                └── train.txt

            $ head -n 2 ctb/dev.txt
            上海 浦东 开发 与 法制 建设 同步
            新华社 上海 二月 十日 电 （ 记者 谢金虎 、 张持坚 ）

        """
        if isinstance(filepath, dict):
            entries = filepath.items()
        else:
            entries = filepath
        for entry in entries:
            if isinstance(entry, tuple):
                # Explicit (criterion, path) pair.
                criteria, corpus = entry
                corpus = get_resource(corpus)
            else:
                # Bare path: the parent folder name of the resolved file is the criterion.
                corpus = get_resource(entry)
                criteria = os.path.basename(os.path.dirname(corpus))
            for sample in super().load_file(corpus):
                sample['criteria'] = criteria
                yield sample


def append_criteria_token(sample: dict, criteria_tokens: Dict[str, int], criteria_token_map: dict) -> dict:
    """Append the special token assigned to the sample's criterion to its input ids.

    A criterion seen for the first time is assigned the next not-yet-assigned token of ``criteria_tokens``
    (in the dict's iteration order), and the assignment is recorded in ``criteria_token_map``.

    Args:
        sample: A dict with ``'criteria'`` and ``'token_input_ids'``. It is modified in place:
            ``'token_input_ids'`` gets the criterion token id appended and ``'token_token_type_ids'`` is set to
            zeros for the original ids followed by a single ``1`` for the appended one.
        criteria_tokens: Available special tokens mapped to their ids.
        criteria_token_map: Criterion-to-token assignments made so far; updated in place.

    Returns:
        The same ``sample`` dict.

    Raises:
        AssertionError: If every token in ``criteria_tokens`` is already assigned to another criterion.
    """
    criteria = sample['criteria']
    token = criteria_token_map.get(criteria, None)
    if not token:
        unused_tokens = list(criteria_tokens.keys())
        size = len(criteria_token_map)
        # ``unused_tokens[size]`` is valid whenever size < len(unused_tokens); the previous check
        # (size + 1 < len) needlessly refused to hand out the last available token.
        assert size < len(unused_tokens), f'No unused token available for criteria {criteria}. ' \
                                          f'Current criteria_token_map = {criteria_token_map}'
        token = criteria_token_map[criteria] = unused_tokens[size]
    sample['token_token_type_ids'] = [0] * len(sample['token_input_ids']) + [1]
    sample['token_input_ids'] = sample['token_input_ids'] + [criteria_tokens[token]]
    return sample


================================================
FILE: hanlp/datasets/tokenization/loaders/txt.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-01 12:35
from typing import Union, List, Callable

from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import TimingFileIterator
from hanlp.utils.span_util import words_to_bmes, words_to_bi
from hanlp.utils.string_util import split_long_sentence_into


class TextTokenizingDataset(TransformableDataset):
    """Dataset that reads a pre-tokenized corpus, one sentence per line, into ``{'token': [...]}`` samples."""

    def __init__(self,
                 data: Union[str, List],
                 transform: Union[Callable, List] = None,
                 cache=None,
                 generate_idx=None,
                 delimiter=None,
                 max_seq_len=None,
                 sent_delimiter=None,
                 char_level=False,
                 hard_constraint=False,
                 ) -> None:
        """A dataset for tagging tokenization tasks.

        Args:
            data: The local or remote path to a dataset, or a list of samples where each sample is a dict.
            transform: Predefined transform(s).
            cache: ``True`` to enable caching, so that transforms won't be called twice.
            generate_idx: Create a :const:`~hanlp_common.constants.IDX` field for each sample to store its order in dataset. Useful for prediction when
                samples are re-ordered by a sampler.
            delimiter: Delimiter between tokens used to split a line in the corpus.
            max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
            sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
                be split here.
            char_level: Whether the sequence length is measured at char level.
            hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
                in a sentence, it will be split at a token anyway.
        """
        # Settings are stored before calling the base constructor.
        # NOTE(review): presumably because the base constructor may already call ``load_file`` -- confirm.
        self.hard_constraint = hard_constraint
        self.char_level = char_level
        self.sent_delimiter = sent_delimiter
        self.max_seq_len = max_seq_len
        self.delimiter = delimiter
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath: str):
        """Load tokenized corpus. The format is one sentence per line, where each line consists of tokens separated
        by a delimiter (usually space).

        .. highlight:: bash
        .. code-block:: bash

            $ head train.txt
            上海 浦东 开发 与 法制 建设 同步
            新华社 上海 二月 十日 电 （ 记者 谢金虎 、 张持坚 ）

        Args:
            filepath: The path to the corpus.
        """
        f = TimingFileIterator(filepath)
        # longest_sent = 0
        for line in f:
            line = line.rstrip('\n')
            # With ``delimiter=None`` this splits on any run of whitespace.
            tokens = line.split(self.delimiter)
            # Skip lines that produced no tokens at all (e.g. blank lines when splitting on whitespace).
            if not tokens:
                continue
            # The length check always counts characters over all tokens, independent of ``char_level``.
            if self.max_seq_len and sum(len(t) for t in tokens) > self.max_seq_len:
                # debug = []
                for short_sents in split_long_sentence_into(tokens, self.max_seq_len, self.sent_delimiter,
                                                            char_level=self.char_level,
                                                            hard_constraint=self.hard_constraint):
                    # debug.extend(short_sents)
                    # longest_sent = max(longest_sent, len(''.join(short_sents)))
                    yield {'token': short_sents}
                # assert debug == tokens
            else:
                # longest_sent = max(longest_sent, len(''.join(tokens)))
                yield {'token': tokens}
            # Report progress using the first 20 characters of the line just processed.
            f.log(line[:20])
        f.erase()
        # print(f'Longest sent: {longest_sent} in {filepath}')


def generate_tags_for_subtokens(sample: dict, tagging_scheme='BMES'):
    """Replace ``sample['token']`` with a sequence of atomic subtokens and, when possible, attach their tags.

    The original value of ``sample['token']`` is kept under ``'raw_token'``. Subtokens are sliced from
    ``sample['token_']`` when that field is present and truthy, otherwise from ``sample['token']``.

    Args:
        sample: During prediction, it is a dict with 'token' being the input text and 'token_subtoken_offsets'
            being the offsets of each subtoken in it. During training, it is a dict with 'token' being a
            sequence of tokens and 'token_subtoken_offsets_group' being subtoken offsets grouped by each token;
            in that case a 'tag' field is produced as well.
        tagging_scheme: ``'BMES'`` or ``'BI'``; only consulted when grouped offsets are present.

    Returns:
        The same ``sample`` dict, modified in place.

    Raises:
        NotImplementedError: If grouped offsets are present and ``tagging_scheme`` is neither of the above.
    """
    # We could use token_token_span but we don't want token_token_span in the batch
    grouped_offsets = sample.get('token_subtoken_offsets_group', None)
    sample['raw_token'] = sample['token']
    source = sample.get('token_') or sample['token']

    if not grouped_offsets:
        # No per-token grouping available: slice the subtokens straight out of the text, no tags.
        sample['token'] = subtoken_offsets_to_subtokens(source, sample['token_subtoken_offsets'])
        return sample

    sample['token'] = subtokens_group_to_subtokens(source, grouped_offsets)
    if tagging_scheme == 'BMES':
        tags = words_to_bmes(grouped_offsets)
    elif tagging_scheme == 'BI':
        tags = words_to_bi(grouped_offsets)
    else:
        raise NotImplementedError(f'Unsupported tagging scheme {tagging_scheme}.')
    sample['tag'] = tags
    return sample


def subtoken_offsets_to_subtokens(text, token_subtoken_offsets):
    """Slice ``text`` at every ``(begin, end)`` offset pair and return the pieces as a list, in order."""
    return [text[begin:end] for begin, end in token_subtoken_offsets]


def subtokens_group_to_subtokens(tokens, subtoken_offsets_group):
    """Flatten per-token offset groups into one list of subtokens.

    The i-th group of ``(begin, end)`` pairs is applied to the i-th token; pairing stops at the shorter of the
    two sequences.
    """
    return [
        token[begin:end]
        for offsets, token in zip(subtoken_offsets_group, tokens)
        for begin, end in offsets
    ]


================================================
FILE: hanlp/datasets/tokenization/sighan2005/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:42
import os

from hanlp.utils.io_util import get_resource, split_file
from hanlp.utils.log_util import logger

SIGHAN2005 = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip'


def make(train):
    """Make sure the 90% training file (and its 10% dev counterpart) exists inside the SIGHAN2005 data folder.

    Args:
        train: Resource identifier of the training split; only the part after the last ``#`` is used, as a
            path relative to the downloaded SIGHAN2005 root.
    """
    root = get_resource(SIGHAN2005)
    relative = train.rsplit('#', 1)[-1]
    train_path = os.path.join(root, relative)
    if os.path.isfile(train_path):
        # Already split on a previous run.
        return
    # The full official training file shares the prefix and ends in '.utf8'.
    full_path = train_path.replace('_90.txt', '.utf8')
    logger.info(f'Splitting {full_path} into training set and valid set with 9:1 proportion')
    valid_path = train_path.replace('90.txt', '10.txt')
    split_file(full_path, train=0.9, dev=0.1, test=0, names={'train': train_path, 'dev': valid_path})
    assert os.path.isfile(train_path), f'Failed to make {train_path}'
    assert os.path.isfile(valid_path), f'Failed to make {valid_path}'
    logger.info(f'Successfully made {train_path} {valid_path}')


================================================
FILE: hanlp/datasets/tokenization/sighan2005/as_.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:42
from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make

SIGHAN2005_AS_DICT = f'{SIGHAN2005}#gold/as_training_words.utf8'
'''Dictionary built on the training set.'''
SIGHAN2005_AS_TRAIN_ALL = f'{SIGHAN2005}#training/as_training.utf8'
'''Full training set.'''
SIGHAN2005_AS_TRAIN = f'{SIGHAN2005}#training/as_training_90.txt'
'''Training set (first 90% of the full official training set).'''
SIGHAN2005_AS_DEV = f'{SIGHAN2005}#training/as_training_10.txt'
'''Dev set (last 10% of the full official training set).'''
SIGHAN2005_AS_TEST_INPUT = f'{SIGHAN2005}#testing/as_testing.utf8'
'''Test input.'''
SIGHAN2005_AS_TEST = f'{SIGHAN2005}#gold/as_testing_gold.utf8'
'''Test set.'''

# Runs on import: creates the 90/10 split files unless the training file already exists.
make(SIGHAN2005_AS_TRAIN)


================================================
FILE: hanlp/datasets/tokenization/sighan2005/cityu.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:42
from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make

SIGHAN2005_CITYU_DICT = f'{SIGHAN2005}#gold/cityu_training_words.utf8'
'''Dictionary built on the training set.'''
SIGHAN2005_CITYU_TRAIN_ALL = f'{SIGHAN2005}#training/cityu_training.utf8'
'''Full training set.'''
SIGHAN2005_CITYU_TRAIN = f'{SIGHAN2005}#training/cityu_training_90.txt'
'''Training set (first 90% of the full official training set).'''
SIGHAN2005_CITYU_DEV = f'{SIGHAN2005}#training/cityu_training_10.txt'
'''Dev set (last 10% of the full official training set).'''
SIGHAN2005_CITYU_TEST_INPUT = f'{SIGHAN2005}#testing/cityu_test.utf8'
'''Test input.'''
SIGHAN2005_CITYU_TEST = f'{SIGHAN2005}#gold/cityu_test_gold.utf8'
'''Test set.'''

# Runs on import: creates the 90/10 split files unless the training file already exists.
make(SIGHAN2005_CITYU_TRAIN)


================================================
FILE: hanlp/datasets/tokenization/sighan2005/msr.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:42
from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make

SIGHAN2005_MSR_DICT = f'{SIGHAN2005}#gold/msr_training_words.utf8'
'''Dictionary built on the training set.'''
SIGHAN2005_MSR_TRAIN_ALL = f'{SIGHAN2005}#training/msr_training.utf8'
'''Full training set.'''
SIGHAN2005_MSR_TRAIN = f'{SIGHAN2005}#training/msr_training_90.txt'
'''Training set (first 90% of the full official training set).'''
SIGHAN2005_MSR_DEV = f'{SIGHAN2005}#training/msr_training_10.txt'
'''Dev set (last 10% of the full official training set).'''
SIGHAN2005_MSR_TEST_INPUT = f'{SIGHAN2005}#testing/msr_test.utf8'
'''Test input.'''
SIGHAN2005_MSR_TEST = f'{SIGHAN2005}#gold/msr_test_gold.utf8'
'''Test set.'''

# Runs on import: creates the 90/10 split files unless the training file already exists.
make(SIGHAN2005_MSR_TRAIN)


================================================
FILE: hanlp/datasets/tokenization/sighan2005/pku.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:42
from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make

SIGHAN2005_PKU_DICT = f'{SIGHAN2005}#gold/pku_training_words.utf8'
'''Dictionary built on the training set.'''
SIGHAN2005_PKU_TRAIN_ALL = f'{SIGHAN2005}#training/pku_training.utf8'
'''Full training set.'''
SIGHAN2005_PKU_TRAIN = f'{SIGHAN2005}#training/pku_training_90.txt'
'''Training set (first 90% of the full official training set).'''
SIGHAN2005_PKU_DEV = f'{SIGHAN2005}#training/pku_training_10.txt'
'''Dev set (last 10% of the full official training set).'''
SIGHAN2005_PKU_TEST_INPUT = f'{SIGHAN2005}#testing/pku_test.utf8'
'''Test input.'''
SIGHAN2005_PKU_TEST = f'{SIGHAN2005}#gold/pku_test_gold.utf8'
'''Test set.'''

# Runs on import: creates the 90/10 split files unless the training file already exists.
make(SIGHAN2005_PKU_TRAIN)


================================================
FILE: hanlp/layers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-26 00:50

================================================
FILE: hanlp/layers/cnn_encoder.py
================================================
from typing import Optional, Tuple

import torch
from torch.nn import Conv1d, Linear


class CnnEncoder(torch.nn.Module):
    """
    A `CnnEncoder` is a combination of multiple convolution layers and max pooling layers. The input to this
    module is of shape `(batch_size, num_tokens, input_dim)`, and the output is of shape
    `(batch_size, output_dim)`.

    The CNN has one convolution layer for each ngram filter size. Each convolution operation gives
    out a vector of size num_filters. The number of times a convolution layer will be used
    is `num_tokens - ngram_size + 1`. The corresponding maxpooling layer aggregates all these
    outputs from the convolution layer and outputs the max.

    This operation is repeated for every ngram size passed, and consequently the dimensionality of
    the output after maxpooling is `len(ngram_filter_sizes) * num_filters`.  This then gets
    (optionally) projected down to a lower dimensional output, specified by `output_dim`, using a
    fully connected layer.  For more details, refer to "A Sensitivity Analysis of (and
    Practitioners’ Guide to) Convolutional Neural Networks for Sentence Classification", Zhang and
    Wallace 2016, particularly Figure 1.

    # Parameters

    embedding_dim : `int`, required
        This is the input dimension to the encoder.  We need this because we can't do shape
        inference in pytorch, and we need to know what size filters to construct in the CNN.
    num_filters : `int`, required
        This is the output dim for each convolutional layer, which is the number of "filters"
        learned by that layer.
    ngram_filter_sizes : `Tuple[int]`, optional (default=`(2, 3, 4, 5)`)
        This specifies both the number of convolutional layers we will create and their sizes.  The
        default of `(2, 3, 4, 5)` will have four convolutional layers, corresponding to encoding
        ngrams of size 2 to 5 with some number of filters.
    conv_layer_activation : `str`, optional (default=`'ReLU'`)
        Name of a class in `torch.nn` which is instantiated without arguments and applied after the
        convolution layers.
    output_dim : `Optional[int]`, optional (default=`None`)
        After doing convolutions and pooling, we'll project the collected features into a vector of
        this size.  If this value is `None` (or `0`), we will just return the result of the max pooling,
        giving an output of shape `len(ngram_filter_sizes) * num_filters`.
    """

    def __init__(
        self,
        embedding_dim: int,
        num_filters: int,
        ngram_filter_sizes: Tuple[int, ...] = (2, 3, 4, 5),
        conv_layer_activation: str = 'ReLU',
        output_dim: Optional[int] = None,
    ) -> None:
        super().__init__()
        self._embedding_dim = embedding_dim
        self._num_filters = num_filters
        self._ngram_filter_sizes = ngram_filter_sizes
        self._activation = getattr(torch.nn, conv_layer_activation)()
        self._output_dim = output_dim

        self._convolution_layers = [
            Conv1d(
                in_channels=self._embedding_dim,
                out_channels=self._num_filters,
                kernel_size=ngram_size,
            )
            for ngram_size in self._ngram_filter_sizes
        ]
        # A plain list is not tracked by ``torch.nn.Module``, so each layer is registered explicitly.
        # The ``conv_layer_%d`` names end up in ``state_dict`` keys and must stay stable.
        for i, conv_layer in enumerate(self._convolution_layers):
            self.add_module("conv_layer_%d" % i, conv_layer)

        maxpool_output_dim = self._num_filters * len(self._ngram_filter_sizes)
        if self._output_dim:
            self.projection_layer = Linear(maxpool_output_dim, self._output_dim)
        else:
            self.projection_layer = None
            self._output_dim = maxpool_output_dim

    def get_input_dim(self) -> int:
        """Return the size of the last dimension expected of the input (``embedding_dim``)."""
        return self._embedding_dim

    def get_output_dim(self) -> int:
        """Return the size of the encoded vector: ``output_dim`` if given, else the max-pooling output size."""
        return self._output_dim

    def forward(self, tokens: torch.Tensor, mask: Optional[torch.BoolTensor] = None):
        """Encode a batch of token sequences into one vector per sequence.

        Args:
            tokens: Tensor of shape `(batch_size, num_tokens, embedding_dim)`.
            mask: Optional tensor of shape `(batch_size, num_tokens)`; when given, `tokens` is multiplied
                by it so that masked positions become zero before the convolutions.

        Returns:
            Tensor of shape `(batch_size, output_dim)`.
        """
        if mask is not None:
            tokens = tokens * mask.unsqueeze(-1)

        # Our input is expected to have shape `(batch_size, num_tokens, embedding_dim)`.  The
        # convolution layers expect input of shape `(batch_size, in_channels, sequence_length)`,
        # where the conv layer `in_channels` is our `embedding_dim`.  We thus need to transpose the
        # tensor first.
        tokens = torch.transpose(tokens, 1, 2)
        # Each convolution layer returns output of size `(batch_size, num_filters, pool_length)`,
        # where `pool_length = num_tokens - ngram_size + 1`.  We then do an activation function,
        # then do max pooling over each filter for the whole input sequence.  Because our max
        # pooling is simple, we just use `torch.max`.  The resultant tensor has shape
        # `(batch_size, num_conv_layers * num_filters)`, which then gets projected using the
        # projection layer, if requested.

        filter_outputs = []
        for i in range(len(self._convolution_layers)):
            # Look the layer up by its registered name rather than through the plain list.
            # NOTE(review): presumably so module replicas use their own copy -- confirm.
            convolution_layer = getattr(self, "conv_layer_{}".format(i))
            filter_outputs.append(self._activation(convolution_layer(tokens)).max(dim=2)[0])

        # Now we have a list of `num_conv_layers` tensors of shape `(batch_size, num_filters)`.
        # Concatenating them gives us a tensor of shape `(batch_size, num_filters * num_conv_layers)`.
        maxpool_output = (
            torch.cat(filter_outputs, dim=1) if len(filter_outputs) > 1 else filter_outputs[0]
        )

        if self.projection_layer is not None:
            result = self.projection_layer(maxpool_output)
        else:
            result = maxpool_output
        return result


================================================
FILE: hanlp/layers/crf/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-18 22:55

================================================
FILE: hanlp/layers/crf/crf.py
================================================
# Copied from https://github.com/kmkurn/pytorch-crf
# Copyright 2017 Kemal Kurniawan <kemal@kkurniawan.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished to do
# so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Presumably the upstream pytorch-crf release this copy was taken from -- TODO confirm.
__version__ = '0.7.2'

from typing import List, Optional

import torch
import torch.nn as nn


class CRF(nn.Module):
    """Conditional random field.

    This module implements a conditional random field [LMP01]_. The forward computation
    of this class computes the log likelihood of the given sequence of tags and
    emission score tensor. This class also has `~CRF.decode` method which finds
    the best tag sequence given an emission score tensor using `Viterbi algorithm`_.

    Masks may be given with any dtype (``bool``, ``uint8``, integer or float); they are
    converted to ``bool`` internally, non-zero meaning "valid timestep".

    Args:
        num_tags: Number of tags.
        batch_first: Whether the first dimension corresponds to the size of a minibatch.

    Attributes:
        start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size
            ``(num_tags,)``.
        end_transitions (`~torch.nn.Parameter`): End transition score tensor of size
            ``(num_tags,)``.
        transitions (`~torch.nn.Parameter`): Transition score tensor of size
            ``(num_tags, num_tags)``.


    .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001).
       "Conditional random fields: Probabilistic models for segmenting and
       labeling sequence data". *Proc. 18th International Conf. on Machine
       Learning*. Morgan Kaufmann. pp. 282–289.

    .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm
    """

    def __init__(self, num_tags: int, batch_first: bool = True) -> None:
        if num_tags <= 0:
            raise ValueError(f'invalid number of tags: {num_tags}')
        super().__init__()
        self.num_tags = num_tags
        self.batch_first = batch_first
        self.start_transitions = nn.Parameter(torch.empty(num_tags))
        self.end_transitions = nn.Parameter(torch.empty(num_tags))
        self.transitions = nn.Parameter(torch.empty(num_tags, num_tags))

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Initialize the transition parameters.

        The parameters will be initialized randomly from a uniform distribution
        between -0.1 and 0.1.
        """
        nn.init.uniform_(self.start_transitions, -0.1, 0.1)
        nn.init.uniform_(self.end_transitions, -0.1, 0.1)
        nn.init.uniform_(self.transitions, -0.1, 0.1)

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}(num_tags={self.num_tags})'

    def forward(
            self,
            emissions: torch.Tensor,
            tags: torch.LongTensor,
            mask: Optional[torch.Tensor] = None,
            reduction: str = 'sum',
    ) -> torch.Tensor:
        """Compute the conditional log likelihood of a sequence of tags given emission scores.

        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            tags (`~torch.LongTensor`): Sequence of tags tensor of size
                ``(seq_length, batch_size)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length)`` otherwise.
            mask (`~torch.Tensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
                Any dtype is accepted; non-zero entries mark valid timesteps. When
                omitted, every timestep is treated as valid.
            reduction: Specifies  the reduction to apply to the output:
                ``none|sum|mean|token_mean``. ``none``: no reduction will be applied.
                ``sum``: the output will be summed over batches. ``mean``: the output will be
                averaged over batches. ``token_mean``: the output will be averaged over tokens.

        Returns:
            `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if
            reduction is ``none``, ``()`` otherwise.

        Raises:
            ValueError: If the tensor shapes are inconsistent, the first timestep of
                the mask is not fully on, or ``reduction`` is not a supported value.
        """
        # Normalise the mask dtype up front: torch.where only accepts a boolean
        # condition (uint8 is deprecated, other dtypes are rejected).
        if mask is not None:
            mask = mask.bool()
        self._validate(emissions, tags=tags, mask=mask)
        if reduction not in ('none', 'sum', 'mean', 'token_mean'):
            raise ValueError(f'invalid reduction: {reduction}')
        if mask is None:
            mask = torch.ones_like(tags, dtype=torch.bool)

        # The internal routines are written for time-major tensors.
        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            tags = tags.transpose(0, 1)
            mask = mask.transpose(0, 1)

        # shape: (batch_size,)
        numerator = self._compute_score(emissions, tags, mask)
        # shape: (batch_size,)
        denominator = self._compute_normalizer(emissions, mask)
        # shape: (batch_size,)
        llh = numerator - denominator

        if reduction == 'none':
            return llh
        if reduction == 'sum':
            return llh.sum()
        if reduction == 'mean':
            return llh.mean()
        assert reduction == 'token_mean'
        return llh.sum() / mask.type_as(emissions).sum()

    def decode(self, emissions: torch.Tensor,
               mask: Optional[torch.Tensor] = None) -> List[List[int]]:
        """Find the most likely tag sequence using Viterbi algorithm.

        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            mask (`~torch.Tensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
                Any dtype is accepted; non-zero entries mark valid timesteps. When
                omitted, every timestep is treated as valid.

        Returns:
            List of list containing the best tag sequence for each batch. Each inner
            list is as long as the number of valid timesteps of that sequence.

        Raises:
            ValueError: If the tensor shapes are inconsistent or the first timestep of
                the mask is not fully on.
        """
        if mask is not None:
            mask = mask.bool()
        self._validate(emissions, mask=mask)
        if mask is None:
            mask = emissions.new_ones(emissions.shape[:2], dtype=torch.bool)

        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            mask = mask.transpose(0, 1)

        return self._viterbi_decode(emissions, mask)

    def _validate(
            self,
            emissions: torch.Tensor,
            tags: Optional[torch.LongTensor] = None,
            mask: Optional[torch.Tensor] = None) -> None:
        """Check that ``emissions``, ``tags`` and ``mask`` have compatible shapes.

        The tensors are expected in the layout selected by ``batch_first``.

        Raises:
            ValueError: On any shape mismatch, or if some sequence has its first
                timestep masked out (i.e. the sequence would be empty).
        """
        if emissions.dim() != 3:
            raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}')
        if emissions.size(2) != self.num_tags:
            raise ValueError(
                f'expected last dimension of emissions is {self.num_tags}, '
                f'got {emissions.size(2)}')

        if tags is not None:
            if emissions.shape[:2] != tags.shape:
                raise ValueError(
                    'the first two dimensions of emissions and tags must match, '
                    f'got {tuple(emissions.shape[:2])} and {tuple(tags.shape)}')

        if mask is not None:
            if emissions.shape[:2] != mask.shape:
                raise ValueError(
                    'the first two dimensions of emissions and mask must match, '
                    f'got {tuple(emissions.shape[:2])} and {tuple(mask.shape)}')
            # The first timestep lives on a different axis depending on the layout.
            first_step = mask[:, 0] if self.batch_first else mask[0]
            if not first_step.bool().all():
                raise ValueError('mask of the first timestep must all be on')

    def _compute_score(
            self, emissions: torch.Tensor, tags: torch.LongTensor,
            mask: torch.Tensor) -> torch.Tensor:
        """Score the given tag sequences (the numerator of the likelihood).

        All tensors are time-major:
        emissions ``(seq_length, batch_size, num_tags)``,
        tags and mask ``(seq_length, batch_size)``.

        Returns:
            Tensor of size ``(batch_size,)`` holding, for each sequence, the sum of the
            start, transition, emission and end scores along its valid timesteps.
        """
        assert emissions.dim() == 3 and tags.dim() == 2
        assert emissions.shape[:2] == tags.shape
        assert emissions.size(2) == self.num_tags
        assert mask.shape == tags.shape
        assert mask[0].all()

        seq_length, batch_size = tags.shape
        # Build the batch index once, on the same device as the tags it is paired with.
        batch_index = torch.arange(batch_size, device=tags.device)
        # Index of the last valid timestep of every sequence.
        # shape: (batch_size,)
        seq_ends = mask.long().sum(dim=0) - 1
        float_mask = mask.type_as(emissions)

        # Start transition score and first emission
        # shape: (batch_size,)
        score = self.start_transitions[tags[0]]
        score += emissions[0, batch_index, tags[0]]

        for i in range(1, seq_length):
            # Transition score to next tag, only added if next timestep is valid (mask == 1)
            # shape: (batch_size,)
            score += self.transitions[tags[i - 1], tags[i]] * float_mask[i]

            # Emission score for next tag, only added if next timestep is valid (mask == 1)
            # shape: (batch_size,)
            score += emissions[i, batch_index, tags[i]] * float_mask[i]

        # End transition score, taken at the last valid tag of each sequence
        # shape: (batch_size,)
        last_tags = tags[seq_ends, batch_index]
        # shape: (batch_size,)
        score += self.end_transitions[last_tags]

        return score

    def _compute_normalizer(
            self, emissions: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """Compute the log partition function with the forward algorithm.

        Tensors are time-major: emissions ``(seq_length, batch_size, num_tags)``,
        mask ``(seq_length, batch_size)``.

        Returns:
            Tensor of size ``(batch_size,)``: log-sum-exp of the scores of all possible
            tag sequences over the valid timesteps of each sequence.
        """
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        # torch.where needs a boolean condition; this is a no-op for bool masks.
        mask = mask.bool()
        seq_length = emissions.size(0)

        # Start transition score and first emission; score has size of
        # (batch_size, num_tags) where for each batch, the j-th column stores
        # the score that the first timestep has tag j
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]

        for i in range(1, seq_length):
            # Broadcast score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emissions = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the sum of scores of all
            # possible tag sequences so far that end with transitioning from tag i to tag j
            # and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emissions

            # Sum over all possible current tags, but we're in score space, so a sum
            # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of
            # all possible tag sequences so far, that end in tag i
            # shape: (batch_size, num_tags)
            next_score = torch.logsumexp(next_score, dim=1)

            # Set score to the next score if this timestep is valid (mask == 1)
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Sum (log-sum-exp) over all possible tags
        # shape: (batch_size,)
        return torch.logsumexp(score, dim=1)

    def _viterbi_decode(self, emissions: torch.FloatTensor,
                        mask: torch.Tensor) -> List[List[int]]:
        """Run Viterbi decoding on time-major tensors.

        emissions is ``(seq_length, batch_size, num_tags)`` and mask is
        ``(seq_length, batch_size)``.

        Returns:
            For every sequence in the batch, the highest scoring tag sequence as a list
            of tag ids whose length equals the number of valid timesteps.
        """
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        # torch.where needs a boolean condition; this is a no-op for bool masks.
        mask = mask.bool()
        seq_length, batch_size = mask.shape

        # Start transition and first emission
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]
        history = []

        # score is a tensor of size (batch_size, num_tags) where for every batch,
        # value at column j stores the score of the best tag sequence so far that ends
        # with tag j
        # history saves where the best tags candidate transitioned from; this is used
        # when we trace back the best tag sequence

        # Viterbi algorithm recursive case: we compute the score of the best tag sequence
        # for every possible next tag
        for i in range(1, seq_length):
            # Broadcast viterbi score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emission = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the score of the best
            # tag sequence so far that ends with transitioning from tag i to tag j and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emission

            # Find the maximum score over all possible current tag
            # shape: (batch_size, num_tags)
            next_score, indices = next_score.max(dim=1)

            # Set score to the next score if this timestep is valid (mask == 1)
            # and save the index that produces the next score
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)
            history.append(indices)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Now, compute the best path for each sample

        # Index of the last valid timestep of each sequence, converted to plain ints
        # once so the per-sample loop below slices with Python integers.
        seq_ends = (mask.long().sum(dim=0) - 1).tolist()
        best_tags_list = []

        for idx in range(batch_size):
            # Find the tag which maximizes the score at the last timestep; this is our best tag
            # for the last timestep
            _, best_last_tag = score[idx].max(dim=0)
            best_tags = [best_last_tag.item()]

            # We trace back where the best last tag comes from, append that to our best tag
            # sequence, and trace it back again, and so on. Only the first seq_ends[idx]
            # back-pointers belong to valid timesteps of this sequence.
            for hist in reversed(history[:seq_ends[idx]]):
                best_last_tag = hist[idx][best_tags[-1]]
                best_tags.append(best_last_tag.item())

            # Reverse the order because we start from the last timestep
            best_tags.reverse()
            best_tags_list.append(best_tags)

        return best_tags_list


================================================
FILE: hanlp/layers/crf/crf_layer_tf.py
================================================
# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import tensorflow as tf

from hanlp.layers.crf.crf_tf import crf_decode, crf_log_likelihood


class CRF(tf.keras.layers.Layer):
    """Conditional Random Field layer (tf.keras)
    `CRF` can be used as the last layer in a network (as a classifier). Input shape (features)
    must be equal to the number of classes the CRF can predict (a linear layer is recommended).

    Note: the loss and accuracy functions of networks using `CRF` must
    use the provided loss and accuracy functions (denoted as loss and viterbi_accuracy)
    as the classification of sequences are used with the layers internal weights.

    Copyright: this is a modified version of
    https://github.com/NervanaSystems/nlp-architect/blob/master/nlp_architect/nn/tensorflow/python/keras/layers/crf.py

    Args:
      num_classes(int): the number of labels to tag each temporal input.
    Input shape:
      nD tensor with shape `(batch_size, sentence length, num_classes)`.
    Output shape:
      nD tensor with shape: `(batch_size, sentence length, num_classes)`.

    """

    def __init__(self, num_classes, **kwargs):
        # The transition matrix is created lazily in `build`.
        self.transitions = None
        super(CRF, self).__init__(**kwargs)
        # num of output labels
        self.output_dim = int(num_classes)
        self.input_spec = tf.keras.layers.InputSpec(min_ndim=3)
        self.supports_masking = False

    def get_config(self):
        """Return the layer configuration merged with the base layer configuration.

        ``transitions`` holds the evaluated transition matrix, or ``None`` when the
        layer has not been built yet.
        """
        config = {
            'output_dim': self.output_dim,
            'supports_masking': self.supports_masking,
            # Before `build` there is no weight to evaluate.
            'transitions': (None if self.transitions is None
                            else tf.keras.backend.eval(self.transitions))
        }
        base_config = super(CRF, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def build(self, input_shape):
        """Create the ``[output_dim, output_dim]`` transition weight.

        Raises:
            ValueError: If the last input dimension is undefined or differs from
                ``output_dim``.
        """
        assert len(input_shape) == 3
        f_shape = tf.TensorShape(input_shape)
        input_spec = tf.keras.layers.InputSpec(min_ndim=3, axes={-1: f_shape[-1]})

        if f_shape[-1] is None:
            raise ValueError('The last dimension of the inputs to `CRF` '
                             'should be defined. Found `None`.')
        if f_shape[-1] != self.output_dim:
            raise ValueError('The last dimension of the input shape must be equal to output'
                             ' shape. Use a linear layer if needed.')
        self.input_spec = input_spec
        self.transitions = self.add_weight(name='transitions',
                                           shape=[self.output_dim, self.output_dim],
                                           initializer='glorot_uniform',
                                           trainable=True)
        self.built = True

    def compute_mask(self, inputs, mask=None):
        # Just pass the received mask from previous layer, to the next layer or
        # manipulate it if this layer changes the shape of the input
        return mask

    # pylint: disable=arguments-differ
    def call(self, inputs, sequence_lengths=None, mask=None, training=None, **kwargs):
        """Decode the inputs with Viterbi, or pass them through while training.

        Args:
          inputs: Unary potentials of shape `(batch_size, sentence length, num_classes)`.
          sequence_lengths: Optional int32 tensor of shape `(batch_size, 1)` with the
            true length of every sequence.
          mask: Optional mask; when ``sequence_lengths`` is not given the lengths are
            the number of non-zero mask entries along axis 1.
          training: Unused here; the train/inference switch is made through
            ``tf.keras.backend.in_train_phase``.

        Returns:
          The inputs themselves in the training phase, otherwise the one-hot encoded
          Viterbi sequence.
        """
        sequences = tf.convert_to_tensor(inputs, dtype=self.dtype)
        if sequence_lengths is not None:
            assert len(sequence_lengths.shape) == 2
            assert tf.convert_to_tensor(sequence_lengths).dtype == 'int32'
            seq_len_shape = tf.convert_to_tensor(sequence_lengths).get_shape().as_list()
            assert seq_len_shape[1] == 1
            sequence_lengths = tf.keras.backend.flatten(sequence_lengths)
        elif mask is not None:
            sequence_lengths = tf.math.count_nonzero(mask, axis=1)
        else:
            # Neither lengths nor a mask were supplied: treat every sequence as
            # spanning the full time axis, as `viterbi_accuracy` does.
            shape = tf.shape(sequences)
            sequence_lengths = tf.fill([shape[0]], shape[1])

        viterbi_sequence, _ = crf_decode(sequences, self.transitions,
                                         sequence_lengths)
        output = tf.keras.backend.one_hot(viterbi_sequence, self.output_dim)
        return tf.keras.backend.in_train_phase(sequences, output)

    # def loss(self, y_true, y_pred):
    #     y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
    #     log_likelihood, self.transitions = \
    #         crf_log_likelihood(y_pred,
    #                            tf.cast(y_true, dtype=tf.int32),
    #                            sequence_lengths,
    #                            transition_params=self.transitions)
    #     return tf.reduce_mean(-log_likelihood)

    def compute_output_shape(self, input_shape):
        tf.TensorShape(input_shape).assert_has_rank(3)
        return input_shape[:2] + (self.output_dim,)

    @property
    def viterbi_accuracy(self):
        """A metric function comparing one-hot Viterbi decodings with ``y_true``.

        The returned function decodes ``y_pred`` assuming every sequence spans the
        whole time axis (no masking is applied).
        """
        def accuracy(y_true, y_pred):
            shape = tf.shape(y_pred)
            sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
            viterbi_sequence, _ = crf_decode(y_pred, self.transitions,
                                             sequence_lengths)
            output = tf.keras.backend.one_hot(viterbi_sequence, self.output_dim)
            return tf.keras.metrics.categorical_accuracy(y_true, output)

        # NOTE(review): `func_name` is the Python 2 spelling; under Python 3 this only
        # sets an ad-hoc attribute and `accuracy.__name__` stays 'accuracy'. Left as is
        # because renaming would change the metric name callers may rely on - confirm.
        accuracy.func_name = 'viterbi_accuracy'
        return accuracy


class CRFLoss:
    """Callable loss that returns the mean negative CRF log-likelihood.

    The transition matrix is read from, and written back to, the wrapped ``crf``
    layer on every call.
    """

    def __init__(self, crf: CRF, dtype) -> None:
        super().__init__()
        self.crf, self.dtype = crf, dtype
        # Expose a function-like name on the instance.
        self.__name__ = self.__class__.__name__

    def __call__(self, y_true, y_pred, sample_weight=None, **kwargs):
        """Compute the loss.

        Args:
          y_true: Gold tags, either tag indices or a rank-3 tensor which is reduced
            with ``argmax`` over the last axis.
          y_pred: Predicted unary potentials, converted to ``self.dtype``.
          sample_weight: Required; its non-zero entries along axis 1 are counted to
            obtain the sequence lengths.
          **kwargs: Ignored.

        Returns:
          The mean of the negated log-likelihood returned by ``crf_log_likelihood``.
        """
        assert sample_weight is not None, 'your model has to support masking'
        gold = tf.argmax(y_true, axis=-1) if len(y_true.shape) == 3 else y_true
        lengths = tf.math.count_nonzero(sample_weight, axis=1)
        potentials = tf.convert_to_tensor(y_pred, dtype=self.dtype)
        log_likelihood, transitions = crf_log_likelihood(
            potentials,
            tf.cast(gold, dtype=tf.int32),
            lengths,
            transition_params=self.crf.transitions)
        # Store the (possibly newly created) transition matrix on the layer.
        self.crf.transitions = transitions
        return tf.reduce_mean(-log_likelihood)


class CRFWrapper(tf.keras.Model):
    """Keras model that feeds the output of an inner model through a `CRF` layer."""

    def __init__(self, model: tf.keras.Model, num_classes=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = model
        # Fall back to the last output dimension of the wrapped model when the
        # number of classes is not given (or is falsy).
        if num_classes:
            crf_size = num_classes
        else:
            crf_size = model.output.shape[-1]
        self.crf = CRF(crf_size)

    def call(self, inputs, training=None, mask=None):
        """Run the wrapped model and return what the CRF layer produces for its output."""
        return self.crf(self.model(inputs, training=training, mask=mask))

    def compute_output_shape(self, input_shape):
        """Delegate to the wrapped model's output shape."""
        return self.model.compute_output_shape(input_shape)


================================================
FILE: hanlp/layers/crf/crf_tf.py
================================================
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

# TODO: Wrap functions in @tf.function once
# https://github.com/tensorflow/tensorflow/issues/29075 is resolved


def crf_sequence_score(inputs, tag_indices, sequence_lengths,
                       transition_params):
    """Computes the unnormalized score for a tag sequence.

    Args:
      inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
    to use as input to the CRF layer.
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which
    we compute the unnormalized score.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      transition_params: Binary potentials handed to `crf_binary_score`; only
    used when max_seq_len is not 1.

    Returns:
      sequence_scores: A [batch_size] vector of unnormalized sequence scores.

    """
    tag_indices = tf.cast(tag_indices, dtype=tf.int32)
    sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)

    if inputs.shape[1] == 1:
        # A single time step: there are no transitions, so just gather the
        # unary potential of the one tag of every example.
        index_dtype = tag_indices.dtype
        num_examples = tf.shape(inputs, out_type=index_dtype)[0]
        row_ids = tf.reshape(tf.range(num_examples, dtype=index_dtype), [-1, 1])
        gathered = tf.gather_nd(
            tf.squeeze(inputs, [1]),
            tf.concat([row_ids, tag_indices], axis=1))
        # Sequences of length <= 0 score zero.
        return tf.where(
            tf.less_equal(sequence_lengths, 0), tf.zeros_like(gathered),
            gathered)

    # General case: unary plus binary scores of the given tag sequence.
    unary = crf_unary_score(tag_indices, sequence_lengths, inputs)
    binary = crf_binary_score(tag_indices, sequence_lengths,
                              transition_params)
    return unary + binary


def crf_multitag_sequence_score(inputs, tag_bitmap, sequence_lengths,
                                transition_params):
    """Computes the unnormalized score of all tag sequences matching
    tag_bitmap.

    tag_bitmap enables more than one tag to be considered correct at each time
    step. This is useful when an observed output at a given time step is
    consistent with more than one tag, and thus the log likelihood of that
    observation must take into account all possible consistent tags.

    Using one-hot vectors in tag_bitmap gives results identical to
    crf_sequence_score.

    Args:
      inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
    to use as input to the CRF layer.
      tag_bitmap: A [batch_size, max_seq_len, num_tags] boolean tensor
    representing all active tags at each index for which to calculate the
    unnormalized score.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      transition_params: Binary potentials handed to `crf_log_norm`; only used
    when max_seq_len is not 1.

    Returns:
      sequence_scores: A [batch_size] vector of unnormalized sequence scores.

    """
    tag_bitmap = tf.cast(tag_bitmap, dtype=tf.bool)
    sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)
    # Inactive tags are pushed to -inf so they vanish under logsumexp. The fill
    # value is cast to the dtype of `inputs` because tf.where requires both
    # branches to share a dtype (a bare float would always be float32).
    filtered_inputs = tf.where(
        tag_bitmap, inputs,
        tf.fill(tf.shape(inputs), tf.cast(float("-inf"), inputs.dtype)))

    # If max_seq_len is 1, we skip the score calculation and simply gather the
    # unary potentials of all active tags.
    def _single_seq_fn():
        return tf.reduce_logsumexp(
            filtered_inputs, axis=[1, 2], keepdims=False)

    def _multi_seq_fn():
        # Compute the logsumexp of all scores of sequences matching the given tags.
        return crf_log_norm(
            inputs=filtered_inputs,
            sequence_lengths=sequence_lengths,
            transition_params=transition_params)

    if inputs.shape[1] == 1:
        return _single_seq_fn()
    else:
        return _multi_seq_fn()


def crf_log_norm(inputs, sequence_lengths, transition_params):
    """Computes the normalization for a CRF.

    Args:
      inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
    to use as input to the CRF layer.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      transition_params: Binary potentials handed to `crf_forward`; only used
    when max_seq_len is not 1.

    Returns:
      log_norm: A [batch_size] vector of normalizers for a CRF.

    """
    sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)
    # The first time step is the initial state of the forward algorithm.
    initial_state = tf.squeeze(tf.slice(inputs, [0, 0, 0], [-1, 1, -1]), [1])

    if inputs.shape[1] == 1:
        # Only one time step: the normalizer is the logsumexp over the unary
        # potentials, no recursion needed.
        log_norm = tf.reduce_logsumexp(initial_state, [1])
    else:
        # Run the forward algorithm over the remaining time steps to obtain
        # the alpha values, then reduce them to the partition function.
        remaining_steps = tf.slice(inputs, [0, 1, 0], [-1, -1, -1])
        alphas = crf_forward(remaining_steps, initial_state,
                             transition_params, sequence_lengths)
        log_norm = tf.reduce_logsumexp(alphas, [1])

    # Mask `log_norm` of the sequences with length <= zero.
    is_empty = tf.less_equal(sequence_lengths, 0)
    return tf.where(is_empty, tf.zeros_like(log_norm), log_norm)


def crf_log_likelihood(inputs,
                       tag_indices,
                       sequence_lengths,
                       transition_params=None):
    """Computes the log-likelihood of tag sequences in a CRF.

    Args:
      inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
    to use as input to the CRF layer.
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which
    we compute the log-likelihood.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      transition_params: A [num_tags, num_tags] transition matrix. When omitted, a
    new Glorot-uniform initialized variable named "transitions" is created.
    (Default value = None)

    Returns:
      log_likelihood: A [batch_size] `Tensor` containing the log-likelihood of
      each example, given the sequence of tag indices.
      transition_params: A [num_tags, num_tags] transition matrix. This is
      either provided by the caller or created in this function.

    """
    num_tags = inputs.shape[2]

    # cast type to handle different types
    tag_indices = tf.cast(tag_indices, dtype=tf.int32)
    sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)

    if transition_params is None:
        initializer = tf.keras.initializers.GlorotUniform()
        # The name must be passed by keyword: the second positional parameter of
        # tf.Variable is `trainable`, not `name`.
        transition_params = tf.Variable(
            initializer([num_tags, num_tags]), name="transitions")

    sequence_scores = crf_sequence_score(inputs, tag_indices, sequence_lengths,
                                         transition_params)
    log_norm = crf_log_norm(inputs, sequence_lengths, transition_params)

    # Normalize the scores to get the log-likelihood per example.
    log_likelihood = sequence_scores - log_norm
    return log_likelihood, transition_params


def crf_unary_score(tag_indices, sequence_lengths, inputs):
    """Computes the unary scores of tag sequences.

    Args:
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      inputs: A tensor of unary potentials indexed as
    [batch_size, max_seq_len, num_tags].

    Returns:
      unary_scores: A [batch_size] vector of unary scores.

    """
    assert len(tag_indices.shape) == 2, 'tag_indices: A [batch_size, max_seq_len] matrix of tag indices.'
    tag_indices = tf.cast(tag_indices, dtype=tf.int32)
    sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)

    batch_size = tf.shape(inputs)[0]
    max_seq_len = tf.shape(inputs)[1]
    num_tags = tf.shape(inputs)[2]

    flattened_inputs = tf.reshape(inputs, [-1])

    # Position of tag 0 of every (example, time step) in the flattened inputs.
    # Both the offsets and tag_indices are int32 here, so they add up directly.
    offsets = tf.expand_dims(tf.range(batch_size) * max_seq_len * num_tags, 1)
    offsets += tf.expand_dims(tf.range(max_seq_len) * num_tags, 0)
    flattened_tag_indices = tf.reshape(offsets + tag_indices, [-1])

    unary_scores = tf.reshape(
        tf.gather(flattened_inputs, flattened_tag_indices),
        [batch_size, max_seq_len])

    # Build the mask in the dtype of the scores so the product below is valid
    # for potentials of any floating point type, not only float32.
    masks = tf.sequence_mask(
        sequence_lengths, maxlen=tf.shape(tag_indices)[1],
        dtype=unary_scores.dtype)

    # Zero out the time steps beyond each sequence length, then sum over time.
    unary_scores = tf.reduce_sum(unary_scores * masks, 1)
    return unary_scores


def crf_binary_score(tag_indices, sequence_lengths, transition_params):
    """Computes the binary (transition) scores of tag sequences.

    Args:
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      transition_params: A [num_tags, num_tags] matrix of binary potentials,
        indexed as [from_tag, to_tag].

    Returns:
      binary_scores: A [batch_size] vector of binary scores.

    """
    tag_indices = tf.cast(tag_indices, dtype=tf.int32)
    sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)

    num_tags = tf.shape(transition_params)[0]
    max_seq_len = tf.shape(tag_indices)[1]
    num_transitions = max_seq_len - 1

    # A transition links the tag at step t (source) to the tag at step t + 1
    # (destination), so drop the last column for sources and the first column
    # for destinations.
    source_tags = tf.slice(tag_indices, [0, 0], [-1, num_transitions])
    target_tags = tf.slice(tag_indices, [0, 1], [-1, num_transitions])

    # Row-major position of each (source, destination) pair in the flattened
    # transition matrix.
    flat_pair_indices = source_tags * num_tags + target_tags
    flat_transitions = tf.reshape(transition_params, [-1])
    per_step_scores = tf.gather(flat_transitions, flat_pair_indices)

    # Dropping the first mask column keeps a transition only when its
    # destination step lies inside the true sequence length.
    step_mask = tf.sequence_mask(
        sequence_lengths, maxlen=max_seq_len, dtype=tf.float32)
    transition_mask = tf.slice(step_mask, [0, 1], [-1, -1])
    return tf.reduce_sum(per_step_scores * transition_mask, 1)


def crf_forward(inputs, state, transition_params, sequence_lengths):
    """Computes the alpha values in a linear-chain CRF.

    See http://www.cs.columbia.edu/~mcollins/fb.pdf for reference.

    Args:
      inputs: A [batch_size, num_steps, num_tags] tensor of unary potentials.
        It is transposed to time-major order before being scanned.
      state: A [batch_size, num_tags] matrix containing the initial alpha
        values that seed the scan.
      transition_params: A [num_tags, num_tags] matrix of binary potentials.
        This matrix is expanded into a [1, num_tags, num_tags] in preparation
        for the broadcast summation occurring within the scan step.
      sequence_lengths: A [batch_size] vector of true sequence lengths.

    Returns:
      new_alphas: A [batch_size, num_tags] matrix containing, for each
        example, the alpha values taken from scan step
        `max(0, sequence_length - 2)`.

    """
    sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)

    # Convert the lengths into 0-based indices into the scan output, clamped
    # at 0.
    # NOTE(review): the offset of 2 presumably reflects that `state` already
    # covers the first time step and `inputs` holds only the remaining ones
    # -- confirm against the caller.
    sequence_lengths = tf.maximum(
        tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 2)
    # tf.scan iterates over the leading axis, so make the tensor time-major.
    inputs = tf.transpose(inputs, [1, 0, 2])
    transition_params = tf.expand_dims(transition_params, 0)

    def _scan_fn(state, inputs):
        # `state` and `inputs` here are the per-step values and intentionally
        # shadow the outer arguments.
        # [batch, num_tags, 1] + [1, num_tags, num_tags]: score of every
        # (previous tag, next tag) pair.
        state = tf.expand_dims(state, 2)
        transition_scores = state + transition_params
        # Log-sum-exp over the previous-tag axis.
        new_alphas = inputs + tf.reduce_logsumexp(transition_scores, [1])
        return new_alphas

    # Back to batch-major: [batch_size, num_steps, num_tags].
    all_alphas = tf.transpose(tf.scan(_scan_fn, inputs, state), [1, 0, 2])
    # Pick, per example, the alphas at its own final step.
    idxs = tf.stack(
        [tf.range(tf.shape(sequence_lengths)[0]), sequence_lengths], axis=1)
    return tf.gather_nd(all_alphas, idxs)


def viterbi_decode(score, transition_params):
    """Decode the highest scoring sequence of tags outside of TensorFlow.

    This is a plain NumPy implementation and should only be used at test time.

    Args:
      score: A [seq_len, num_tags] matrix of unary potentials.
      transition_params: A [num_tags, num_tags] matrix of binary potentials.

    Returns:
      viterbi: A [seq_len] list of integers containing the highest scoring tag
      indices.
      viterbi_score: A float containing the score for the Viterbi sequence.

    """
    best_scores = np.zeros_like(score)
    best_prev = np.zeros_like(score, dtype=np.int32)
    best_scores[0] = score[0]
    num_steps = score.shape[0]

    for step in range(1, num_steps):
        # candidates[i, j]: best score ending in tag i at the previous step,
        # followed by the transition i -> j.
        candidates = best_scores[step - 1][:, np.newaxis] + transition_params
        best_prev[step] = np.argmax(candidates, 0)
        best_scores[step] = score[step] + np.max(candidates, 0)

    final_scores = best_scores[-1]

    # Follow the backpointers from the best final tag back to the first step.
    path = [np.argmax(final_scores)]
    for step in range(num_steps - 1, 0, -1):
        path.append(best_prev[step][path[-1]])
    path.reverse()

    return path, np.max(final_scores)


class CrfDecodeForwardRnnCell(tf.keras.layers.AbstractRNNCell):
    """Computes the forward decoding in a linear-chain CRF.

    Each step emits the backpointers for that step and carries the running
    best scores forward as the cell state.
    """

    def __init__(self, transition_params, **kwargs):
        """Initialize the CrfDecodeForwardRnnCell.

        Args:
          transition_params: A [num_tags, num_tags] matrix of binary
            potentials. This matrix is expanded into a
            [1, num_tags, num_tags] in preparation for the broadcast
            summation occurring within the cell.
          **kwargs: Forwarded to the base cell constructor.
        """
        super(CrfDecodeForwardRnnCell, self).__init__(**kwargs)
        self._transition_params = tf.expand_dims(transition_params, 0)
        # Taken from the static shape of the transition matrix.
        self._num_tags = transition_params.shape[0]

    @property
    def state_size(self):
        # The state holds one score per tag.
        return self._num_tags

    @property
    def output_size(self):
        # The output holds one backpointer per tag.
        return self._num_tags

    def build(self, input_shape):
        # No weights are created here; this only delegates to the base class.
        super(CrfDecodeForwardRnnCell, self).build(input_shape)

    def call(self, inputs, state):
        """Run one forward decoding step.

        Args:
          inputs: A [batch_size, num_tags] matrix of unary potentials.
          state: A sequence whose first element is a [batch_size, num_tags]
            matrix containing the previous step's score values.

        Returns:
          backpointers: A [batch_size, num_tags] int32 matrix of backpointers.
          new_state: A [batch_size, num_tags] matrix of new score values.

        """
        # [batch, num_tags, 1] + [1, num_tags, num_tags]: score of every
        # (previous tag, next tag) pair.
        state = tf.expand_dims(state[0], 2)
        transition_scores = state + self._transition_params
        # Best score reaching each next tag, and which previous tag gave it.
        new_state = inputs + tf.reduce_max(transition_scores, [1])
        backpointers = tf.argmax(transition_scores, 1)
        backpointers = tf.cast(backpointers, dtype=tf.int32)
        return backpointers, new_state


def crf_decode_forward(inputs, state, transition_params, sequence_lengths):
    """Computes forward decoding in a linear-chain CRF.

    Runs a `CrfDecodeForwardRnnCell` over the time axis with a Keras RNN
    layer, masking steps beyond each example's length.

    Args:
      inputs: A [batch_size, num_steps, num_tags] tensor of unary potentials
        (axis 1 is used as the time axis for the sequence mask).
      state: A [batch_size, num_tags] matrix containing the initial score
        values.
      transition_params: A [num_tags, num_tags] matrix of binary potentials.
      sequence_lengths: A [batch_size] vector of true sequence lengths.

    Returns:
      backpointers: The per-step backpointers emitted by the cell
        (`return_sequences=True`).
      new_state: The final score values returned by the layer
        (`return_state=True`).

    """
    sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)
    # Boolean [batch_size, num_steps] mask of valid steps.
    mask = tf.sequence_mask(sequence_lengths, tf.shape(inputs)[1])
    crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params)
    crf_fwd_layer = tf.keras.layers.RNN(
        crf_fwd_cell, return_sequences=True, return_state=True)
    return crf_fwd_layer(inputs, state, mask=mask)


def crf_decode_backward(inputs, state):
    """Computes backward decoding in a linear-chain CRF.

    Args:
      inputs: A [batch_size, num_steps, num_tags] tensor of backpointers. It
        is scanned along axis 1 in the order given, so the caller decides the
        direction (see the reversal done in `crf_decode`).
      state: A [batch_size, 1] matrix with the tag index that starts the
        scan.

    Returns:
      new_tags: A [batch_size, num_steps, 1] tensor containing the tag index
        selected at every scanned step.

    """
    # tf.scan iterates over the leading axis, so make the tensor time-major.
    inputs = tf.transpose(inputs, [1, 0, 2])

    def _scan_fn(state, inputs):
        # `state` and `inputs` are the per-step values and intentionally
        # shadow the outer arguments.
        state = tf.squeeze(state, axis=[1])
        # Pair every batch row with its current tag to look up the
        # backpointer stored for that tag.
        idxs = tf.stack([tf.range(tf.shape(inputs)[0]), state], axis=1)
        new_tags = tf.expand_dims(tf.gather_nd(inputs, idxs), axis=-1)
        return new_tags

    return tf.transpose(tf.scan(_scan_fn, inputs, state), [1, 0, 2])


def crf_decode(potentials, transition_params, sequence_length):
    """Decode the highest scoring sequence of tags in TensorFlow.

    This is the tensor-based counterpart of `viterbi_decode`.

    Args:
      potentials: A [batch_size, max_seq_len, num_tags] tensor of
        unary potentials.
      transition_params: A [num_tags, num_tags] matrix of
        binary potentials.
      sequence_length: A [batch_size] vector of true sequence lengths.

    Returns:
      decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32`.
        Contains the highest scoring tag indices.
      best_score: A [batch_size] vector, containing the score of `decode_tags`.

    """
    sequence_length = tf.cast(sequence_length, dtype=tf.int32)

    if potentials.shape[1] == 1:
        # With a single time step there are no transitions to consider: the
        # answer is simply the argmax tag and its activation.
        only_step = tf.squeeze(potentials, [1])
        single_tags = tf.expand_dims(tf.argmax(only_step, axis=1), 1)
        single_score = tf.reduce_max(only_step, axis=1)
        return tf.cast(single_tags, dtype=tf.int32), single_score

    # Forward pass: the first step seeds the scores, the remaining steps are
    # fed through the decoding cell to obtain backpointers and final scores.
    first_step = tf.slice(potentials, [0, 0, 0], [-1, 1, -1])
    first_step = tf.squeeze(first_step, axis=[1])
    remaining_steps = tf.slice(potentials, [0, 1, 0], [-1, -1, -1])

    num_transitions = tf.maximum(
        tf.constant(0, dtype=sequence_length.dtype), sequence_length - 1)

    backpointers, last_score = crf_decode_forward(
        remaining_steps, first_step, transition_params, num_transitions)

    # Backward pass: walk the backpointers from the last valid step to the
    # first, starting at the best final tag.
    backpointers = tf.reverse_sequence(
        backpointers, num_transitions, seq_axis=1)

    last_tags = tf.cast(tf.argmax(last_score, axis=1), dtype=tf.int32)
    last_tags = tf.expand_dims(last_tags, axis=-1)

    earlier_tags = crf_decode_backward(backpointers, last_tags)
    earlier_tags = tf.squeeze(earlier_tags, axis=[2])

    # Tags were collected last-to-first; restore time order per example.
    decode_tags = tf.concat([last_tags, earlier_tags], axis=1)
    decode_tags = tf.reverse_sequence(
        decode_tags, sequence_length, seq_axis=1)

    best_score = tf.reduce_max(last_score, axis=1)
    return decode_tags, best_score


================================================
FILE: hanlp/layers/dropout.py
================================================
# -*- coding:utf-8 -*-
# Date: 2020-06-05 17:47
from typing import List

import torch
import torch.nn as nn


class WordDropout(nn.Module):
    """Randomly replaces token ids with an out-of-vocabulary id during training."""

    def __init__(self, p: float, oov_token: int, exclude_tokens: List[int] = None) -> None:
        """
        Args:
            p: The probability that a token is replaced by ``oov_token``.
            oov_token: The id written in place of dropped tokens.
            exclude_tokens: Ids that must never be replaced (e.g. padding).
                Falls back to ``[0]`` when omitted or empty.
        """
        super().__init__()
        self.oov_token = oov_token
        self.p = p
        if not exclude_tokens:
            exclude_tokens = [0]
        self.exclude = exclude_tokens

    @staticmethod
    def token_dropout(tokens: torch.LongTensor,
                      oov_token: int,
                      exclude_tokens: List[int],
                      p: float = 0.2,
                      training: bool = True) -> torch.LongTensor:
        """During training, randomly replaces some of the non-padding tokens to a mask token with probability ``p``

        Adopted from https://github.com/Hyperparticle/udify

        Args:
          tokens: The current batch of padded sentences with word ids
          oov_token: The mask token
          exclude_tokens: The tokens for padding the input batch; these are
            never replaced
          p: The probability a word gets mapped to the unknown token
            (Default value = 0.2)
          training: Applies the dropout if set to ``True``
            (Default value = True)

        Returns:
          A copy of the input batch with token dropout applied. When dropout
          is not applied (``training`` is false or ``p`` is not positive) the
          input tensor itself is returned.

        """
        if not (training and p > 0):
            return tokens

        # Only positions that do not hold an excluded id may be replaced.
        padding_mask = tokens.new_ones(tokens.size(), dtype=torch.bool)
        for pad in exclude_tokens:
            padding_mask &= (tokens != pad)

        # Draw one uniform sample per position; those below p are dropped.
        dropout_mask = (tokens.new_empty(tokens.size(), dtype=torch.float).uniform_() < p)
        oov_mask = dropout_mask & padding_mask

        oov_fill = tokens.new_empty(tokens.size(), dtype=torch.long).fill_(oov_token)
        return torch.where(oov_mask, oov_fill, tokens)

    def forward(self, tokens: torch.LongTensor) -> torch.LongTensor:
        """Apply token dropout using this module's settings and training flag."""
        return self.token_dropout(tokens, self.oov_token, self.exclude, self.p, self.training)


class SharedDropout(nn.Module):
    """Dropout whose mask is shared along one axis of the input.

    With ``batch_first=True`` the mask is sampled from ``x[:, 0]`` and reused
    for every index of dimension 1; otherwise it is sampled from ``x[0]`` and
    broadcast over dimension 0. Kept elements are scaled by ``1 / (1 - p)``.
    """

    def __init__(self, p=0.5, batch_first=True):
        """
        Args:
            p: The probability of an element being zeroed.
            batch_first: Whether the batch is the first dimension of the
                input (so the shared axis is dimension 1).
        """
        super(SharedDropout, self).__init__()

        self.p = p
        self.batch_first = batch_first

    def extra_repr(self):
        s = f"p={self.p}"
        if self.batch_first:
            s += f", batch_first={self.batch_first}"

        return s

    def forward(self, x):
        """Return ``x`` with the shared dropout mask applied (training only).

        The input tensor is never modified: an in-place multiply would
        silently change the caller's tensor and fails outright for leaf
        tensors that require grad.
        """
        if not self.training:
            return x

        if self.batch_first:
            mask = self.get_mask(x[:, 0], self.p).unsqueeze(1)
        else:
            mask = self.get_mask(x[0], self.p)
        return x * mask

    @staticmethod
    def get_mask(x, p):
        """Sample a Bernoulli keep-mask shaped like ``x``, scaled by ``1 / (1 - p)``."""
        mask = x.new_empty(x.shape).bernoulli_(1 - p)
        mask = mask / (1 - p)

        return mask


class IndependentDropout(nn.Module):

    def __init__(self, p=0.5):
        r"""
        Dropout applied jointly to :math:`N` tensors, each with its own mask.

        A separate mask is drawn per tensor over its first two dimensions.
        At every position where :math:`M` of the :math:`N` tensors survive,
        the survivors are scaled by :math:`N/M` to compensate; where all of
        them are dropped, zeros are returned.
        Adapted from the SuPar parser (https://github.com/yzhangcs/parser).

        Args:
            p (float):
                The probability of an element to be zeroed. Default: 0.5.

        Examples:
            >>> x, y = torch.ones(1, 3, 5), torch.ones(1, 3, 5)
            >>> x, y = IndependentDropout()(x, y)
            >>> x
            tensor([[[1., 1., 1., 1., 1.],
                     [0., 0., 0., 0., 0.],
                     [2., 2., 2., 2., 2.]]])
            >>> y
            tensor([[[1., 1., 1., 1., 1.],
                     [2., 2., 2., 2., 2.],
                     [0., 0., 0., 0., 0.]]])
        """
        super(IndependentDropout, self).__init__()
        self.p = p

    def extra_repr(self):
        return f"p={self.p}"

    def forward(self, *items):
        # Outside of training the inputs pass through untouched.
        if not self.training:
            return items

        keep_prob = 1 - self.p
        keep_masks = [item.new_empty(item.shape[:2]).bernoulli_(keep_prob)
                      for item in items]
        survivors = sum(keep_masks)
        # Clamp the survivor count to at least one so fully dropped positions
        # do not divide by zero (their masks are zero anyway).
        factor = len(items) / survivors.max(torch.ones_like(survivors))
        return [item * (keep * factor).unsqueeze(dim=-1)
                for item, keep in zip(items, keep_masks)]


class LockedDropout(nn.Module):
    """Dropout that reuses one mask along dimension 1 of 3-d inputs.

    For a 3-d input a single mask of shape ``(size(0), 1, size(2))`` is
    expanded over dimension 1; for a 2-d input every element gets its own
    mask entry. Kept elements are scaled by ``1 / (1 - dropout_rate)``.
    """

    def __init__(self, dropout_rate=0.5):
        super(LockedDropout, self).__init__()
        self.dropout_rate = dropout_rate

    def forward(self, x):
        # Pass the input through outside of training or when the rate is zero.
        if not (self.training and self.dropout_rate):
            return x

        keep_prob = 1 - self.dropout_rate
        ndim = x.dim()
        if ndim == 3:
            shared = x.new_empty(x.size(0), 1, x.size(2)).bernoulli_(keep_prob) / keep_prob
            mask = shared.expand_as(x)
        elif ndim == 2:
            mask = torch.empty_like(x).bernoulli_(keep_prob) / keep_prob
        else:
            raise ValueError(f'Unsupported dim: {x.dim()}. Only 2d (T,C) or 3d (B,T,C) is supported')
        return mask * x


================================================
FILE: hanlp/layers/embeddings/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-24 21:48


================================================
FILE: hanlp/layers/embeddings/char_cnn.py
================================================
# Adapted from https://github.com/allenai/allennlp under the Apache License 2.0.
# Changes: repackaged the encoder and added the CharCNNEmbedding subclass.

from typing import Union, Tuple, Optional, Callable
import torch
from torch import nn
from hanlp.layers.cnn_encoder import CnnEncoder
from hanlp.layers.time_distributed import TimeDistributed
from hanlp_common.configurable import AutoConfigurable
from hanlp.common.transform import VocabDict, ToChar
from hanlp.common.vocab import Vocab
from hanlp.layers.embeddings.embedding import EmbeddingDim, Embedding


class CharCNN(nn.Module):
    def __init__(self,
                 field: str,
                 embed: Union[int, Embedding], num_filters: int,
                 ngram_filter_sizes: Tuple[int, ...] = (2, 3, 4, 5),
                 conv_layer_activation: str = 'ReLU',
                 output_dim: Optional[int] = None,
                 vocab_size=None) -> None:
        """Character-level CNN embedding module.

        Characters are embedded with a freshly created ``nn.Embedding`` and
        then encoded with a ``CnnEncoder``; both are wrapped in
        ``TimeDistributed``.

        The description of the encoder below follows
        allennlp.modules.seq2vec_encoders.cnn_encoder.CnnEncoder (Apache 2.0):
        it has one convolution layer for each ngram filter size, each giving
        out a vector of size ``num_filters``, followed by max pooling. The
        dimensionality after pooling is
        ``len(ngram_filter_sizes) * num_filters``, which is then (optionally)
        projected to ``output_dim``. For more details, refer to "A Sensitivity
        Analysis of (and Practitioners' Guide to) Convolutional Neural
        Networks for Sentence Classification", Zhang and Wallace 2016,
        particularly Figure 1.

        Args:
            field: The field in samples this encoder will work on.
            embed: The feature size used to create the character
                ``nn.Embedding``.
                NOTE(review): the annotation also advertises ``Embedding``
                objects, but anything that is not an ``int`` is rejected
                below -- confirm which is intended.
            num_filters: This is the output dim for each convolutional layer, which is the number of "filters"
                learned by that layer.
            ngram_filter_sizes: This specifies both the number of convolutional layers we will create and their sizes.  The
                default of `(2, 3, 4, 5)` will have four convolutional layers, corresponding to encoding
                ngrams of size 2 to 5 with some number of filters.
            conv_layer_activation: `Activation`, optional (default=`torch.nn.ReLU`)
                Activation to use after the convolution layers.
            output_dim: After doing convolutions and pooling, we'll project the collected features into a vector of
                this size.  If this value is `None`, we will just return the result of the max pooling,
                giving an output of shape `len(ngram_filter_sizes) * num_filters`.
            vocab_size: The size of character vocab.

        Raises:
            ValueError: If ``embed`` is not an ``int``.
        """
        super().__init__()
        EmbeddingDim.__init__(self)
        if not isinstance(embed, int):
            raise ValueError(f'Unrecognized type for {embed}')

        # The character embedding layer, followed by the n-gram CNN on top.
        char_embed = nn.Embedding(num_embeddings=vocab_size,
                                  embedding_dim=embed)
        self.field = field
        self.embed = TimeDistributed(char_embed)
        cnn = CnnEncoder(char_embed.embedding_dim, num_filters, ngram_filter_sizes,
                         conv_layer_activation, output_dim)
        self.encoder = TimeDistributed(cnn)
        self.embedding_dim = output_dim or num_filters * len(ngram_filter_sizes)

    def forward(self, batch: dict, **kwargs):
        """Encode the character ids stored under ``f'{field}_char_id'`` in ``batch``."""
        char_ids: torch.Tensor = batch[f'{self.field}_char_id']
        # The mask marks every id that is greater than or equal to zero.
        valid = char_ids.ge(0)
        return self.encoder(self.embed(char_ids), valid)

    def get_output_dim(self) -> int:
        """Return the size of the vectors produced by this module."""
        return self.embedding_dim


class CharCNNEmbedding(Embedding, AutoConfigurable):
    def __init__(self,
                 field,
                 embed: Union[int, Embedding],
                 num_filters: int,
                 ngram_filter_sizes: Tuple[int, ...] = (2, 3, 4, 5),
                 conv_layer_activation: str = 'ReLU',
                 output_dim: Optional[int] = None,
                 min_word_length=None
                 ) -> None:
        """Builder for the character-level CNN embedding module ``CharCNN``.

        Args:
            field: The character field in samples this encoder will work on.
            embed: An ``Embedding`` object or the feature size to create an ``Embedding`` object.
            num_filters: This is the output dim for each convolutional layer, which is the number of "filters"
                learned by that layer.
            ngram_filter_sizes: This specifies both the number of convolutional layers we will create and their sizes.  The
                default of `(2, 3, 4, 5)` will have four convolutional layers, corresponding to encoding
                ngrams of size 2 to 5 with some number of filters.
            conv_layer_activation: `Activation`, optional (default=`torch.nn.ReLU`)
                Activation to use after the convolution layers.
            output_dim: After doing convolutions and pooling, we'll project the collected features into a vector of
                this size.  If this value is `None`, we will just return the result of the max pooling,
                giving an output of shape `len(ngram_filter_sizes) * num_filters`.
            min_word_length: For ngram filter with max size, the input (chars) is required to have at least max size
                chars. Defaults to ``max(ngram_filter_sizes)``.
        """
        super().__init__()
        # The widest filter needs at least that many characters per word.
        self.min_word_length = max(ngram_filter_sizes) if min_word_length is None else min_word_length
        self.output_dim = output_dim
        self.conv_layer_activation = conv_layer_activation
        self.ngram_filter_sizes = ngram_filter_sizes
        self.num_filters = num_filters
        self.embed = embed
        self.field = field

    def transform(self, vocabs: VocabDict, **kwargs) -> Optional[Callable]:
        """Register the character vocab if missing and return the ``ToChar`` transform."""
        if isinstance(self.embed, Embedding):
            self.embed.transform(vocabs=vocabs)
        name = self.vocab_name
        if name not in vocabs:
            vocabs[name] = Vocab()
        pad = vocabs[name].safe_pad_token
        return ToChar(self.field, name, min_word_length=self.min_word_length, pad=pad)

    @property
    def vocab_name(self):
        """Name of the character vocab: the field name suffixed with ``_char``."""
        return f'{self.field}_char'

    def module(self, vocabs: VocabDict, **kwargs) -> Optional[nn.Module]:
        """Build the ``CharCNN`` module, sized by the character vocab."""
        if isinstance(self.embed, Embedding):
            embed = self.embed.module(vocabs=vocabs)
        else:
            embed = self.embed
        char_vocab_size = len(vocabs[self.vocab_name])
        return CharCNN(self.field,
                       embed,
                       self.num_filters,
                       self.ngram_filter_sizes,
                       self.conv_layer_activation,
                       self.output_dim,
                       vocab_size=char_vocab_size)


================================================
FILE: hanlp/layers/embeddings/char_cnn_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-20 21:15
from functools import reduce

import tensorflow as tf

from hanlp.common.vocab_tf import VocabTF
from hanlp.utils.tf_util import hanlp_register


@hanlp_register
class CharCNNEmbeddingTF(tf.keras.layers.Layer):
    """Keras layer that embeds word strings through a character-level CNN.

    Each word is split into characters, the characters are embedded, and a
    1-d convolution with masked max-pooling produces one vector per word.
    """

    def __init__(self, word_vocab: VocabTF, char_vocab: VocabTF,
                 char_embedding=100,
                 kernel_size=3,
                 filters=50,
                 dropout=0.5,
                 trainable=True, name=None, dtype=None, dynamic=False,
                 **kwargs):
        """
        Args:
          word_vocab: Word vocabulary; its ``pad_token`` identifies padded
            word positions.
          char_vocab: Character vocabulary used to pad and look up characters.
          char_embedding: Output dimension of the character embedding.
          kernel_size: Kernel size of the 1-d convolution.
          filters: Number of convolution filters.
          dropout: Dropout rate applied to the character embeddings.
          trainable, name, dtype, dynamic, **kwargs: Forwarded to the base
            Keras layer.
        """
        super().__init__(trainable, name, dtype, dynamic, **kwargs)
        self.char_embedding = char_embedding
        self.filters = filters
        self.kernel_size = kernel_size
        self.char_vocab = char_vocab
        self.word_vocab = word_vocab
        # mask_zero=True makes the embedding emit a mask for character id 0.
        self.embedding = tf.keras.layers.Embedding(input_dim=len(self.char_vocab), output_dim=char_embedding,
                                                   trainable=True, mask_zero=True)
        self.dropout = tf.keras.layers.Dropout(dropout)
        self.cnn = tf.keras.layers.Conv1D(filters, kernel_size, padding='same')

    def call(self, inputs: tf.Tensor, **kwargs):
        """Turn a tensor of word strings into per-word character-CNN features."""
        # True wherever the word is not the word-level padding token.
        mask = tf.not_equal(inputs, self.word_vocab.pad_token)
        # Drop padded words, split the rest into characters, then pad the
        # characters back into a dense tensor with the char padding token.
        inputs = tf.ragged.boolean_mask(inputs, mask)
        chars = tf.strings.unicode_split(inputs, input_encoding='UTF-8')
        chars = chars.to_tensor(default_value=self.char_vocab.pad_token)
        chars = self.char_vocab.lookup(chars)
        embed = self.embedding(chars)
        # Character-level mask attached by the embedding layer.
        weights = embed._keras_mask
        embed = self.dropout(embed)
        features = masked_conv1d_and_max(embed, weights, self.cnn)
        # Expose the word-level mask on the output tensor.
        features._keras_mask = mask
        return features

    def compute_output_shape(self, input_shape):
        # Delegates to the base implementation unchanged.
        return super().compute_output_shape(input_shape)

    def get_config(self):
        """Return the layer config: the base config plus this layer's hyper-parameters."""
        config = {
            'char_embedding': self.char_embedding,
            'kernel_size': self.kernel_size,
            'filters': self.filters,
            'dropout': self.dropout.rate,
        }
        base_config = super(CharCNNEmbeddingTF, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


def masked_conv1d_and_max(t, weights, conv1d):
    """Applies 1d convolution and a masked max-pooling

    https://github.com/guillaumegenthial/tf_ner/blob/master/models/chars_conv_lstm_crf/masked_conv.py

    Args:
      t(tf.Tensor): A tensor with at least 3 dimensions [d1, d2, ..., dn-1, dn]
      weights(tf.Tensor of tf.bool): A mask with one entry per position of
        ``t`` over all but its last dimension, i.e. [d1, d2, ..., dn-1]
      conv1d: The convolution layer to apply; its ``filters`` attribute gives
        the size of the last output dimension.

    Returns:
      A tensor of shape [d1, d2, ..., dn-2, conv1d.filters] holding, for every
      sequence, the maximum of the convolution output over dimension dn-1.

    """
    dynamic_shape = tf.shape(t)
    rank = t.shape.ndims
    leading_dims = [dynamic_shape[i] for i in range(rank - 2)]

    # Collapse all leading dimensions into one batch dimension.
    flat_batch = reduce(lambda x, y: x * y, leading_dims)
    seq_len = dynamic_shape[-2]
    channels = t.shape[-1]

    mask = tf.reshape(weights, shape=[flat_batch, seq_len, 1])
    mask = tf.cast(mask, tf.float32)

    # Zero the masked positions before and after the convolution.
    flat_input = tf.reshape(t, shape=[flat_batch, seq_len, channels]) * mask
    conv_out = conv1d(flat_input) * mask

    # Give masked positions the per-sequence minimum so they cannot win the
    # max; a fully masked sequence ends up as zero.
    conv_out += (1. - mask) * tf.reduce_min(conv_out, axis=-2, keepdims=True)
    pooled = tf.reduce_max(conv_out, axis=-2)

    # Restore the original leading dimensions.
    return tf.reshape(pooled, shape=leading_dims + [conv1d.filters])


================================================
FILE: hanlp/layers/embeddings/char_rnn.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-02 23:49
from typing import Optional, Callable, Union

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

from hanlp_common.configurable import AutoConfigurable
from hanlp.common.transform import VocabDict, ToChar
from hanlp.common.vocab import Vocab
from hanlp.layers.embeddings.embedding import Embedding, EmbeddingDim


class CharRNN(nn.Module, EmbeddingDim):
    def __init__(self,
                 field,
                 vocab_size,
                 embed: Union[int, nn.Embedding],
                 hidden_size):
        """Character level RNN embedding module.

        Args:
            field: The field in samples this encoder will work on.
            vocab_size: The size of character vocab.
            embed: An ``Embedding`` object or the feature size to create an ``Embedding`` object.
            hidden_size: The hidden size of RNNs.

        Raises:
            ValueError: If ``embed`` is neither an ``int`` nor an ``nn.Module``.
        """
        super(CharRNN, self).__init__()
        self.field = field
        # the embedding layer
        if isinstance(embed, int):
            self.embed = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embed)
        elif isinstance(embed, nn.Module):
            self.embed = embed
            embed = embed.embedding_dim
        else:
            raise ValueError(f'Unrecognized type for {embed}')
        # the lstm layer
        self.lstm = nn.LSTM(input_size=embed,
                            hidden_size=hidden_size,
                            batch_first=True,
                            bidirectional=True)

    def forward(self, batch, mask, **kwargs):
        """Embed every word from its characters with a bidirectional LSTM.

        Args:
            batch: A dict holding the character ids under
                ``f'{field}_char_id'``.
            mask: Unused; it is recomputed from the character ids.

        Returns:
            A ``[batch_size, seq_len, 2 * hidden_size]`` tensor; words without
            any character are left as zeros.
        """
        x = batch[f'{self.field}_char_id']
        # [batch_size, seq_len, fix_len]
        mask = x.ne(0)
        # [batch_size, seq_len]
        lens = mask.sum(-1)
        char_mask = lens.gt(0)

        # [n, fix_len, n_embed]
        # Select the non-empty words exactly once: a batch-level embedding
        # returns all words and is filtered afterwards, whereas a plain
        # embedding layer is fed the already filtered ids. Indexing the
        # filtered result with ``char_mask`` a second time would fail because
        # its leading dimensions no longer match the mask.
        if isinstance(self.embed, EmbeddingDim):
            x = self.embed(batch)[char_mask]
        else:
            x = self.embed(x[char_mask])
        x = pack_padded_sequence(x, lens[char_mask].cpu(), True, False)
        x, (h, _) = self.lstm(x)
        # [n, n_out]: concatenate the final hidden states of both directions
        h = torch.cat(torch.unbind(h), -1)
        # [batch_size, seq_len, n_out]
        embed = h.new_zeros(*lens.shape, h.size(-1))
        embed = embed.masked_scatter_(char_mask.unsqueeze(-1), h)

        return embed

    @property
    def embedding_dim(self) -> int:
        # Forward and backward hidden states are concatenated.
        return self.lstm.hidden_size * 2


class CharRNNEmbedding(Embedding, AutoConfigurable):
    def __init__(self,
                 field,
                 embed,
                 hidden_size,
                 max_word_length=None) -> None:
        """Builder for the character level RNN embedding module ``CharRNN``.

        Args:
            field: The field in samples this encoder will work on.
            embed: An ``Embedding`` object or the feature size to create an ``Embedding`` object.
            hidden_size: The hidden size of RNNs.
            max_word_length: Character sequence longer than ``max_word_length`` will be truncated.
        """
        super().__init__()
        self.field = field
        self.hidden_size = hidden_size
        self.embed = embed
        self.max_word_length = max_word_length

    def transform(self, vocabs: VocabDict, **kwargs) -> Optional[Callable]:
        """Register the character vocab if missing and return the ``ToChar`` transform."""
        if isinstance(self.embed, Embedding):
            self.embed.transform(vocabs=vocabs)
        name = self.vocab_name
        if name not in vocabs:
            vocabs[name] = Vocab()
        return ToChar(self.field, name, max_word_length=self.max_word_length)

    @property
    def vocab_name(self):
        """Name of the character vocab: the field name suffixed with ``_char``."""
        return f'{self.field}_char'

    def module(self, vocabs: VocabDict, **kwargs) -> Optional[nn.Module]:
        """Build the ``CharRNN`` module, sized by the character vocab."""
        if isinstance(self.embed, Embedding):
            embed = self.embed.module(vocabs=vocabs)
        else:
            embed = self.embed
        char_vocab_size = len(vocabs[self.vocab_name])
        return CharRNN(self.field, char_vocab_size, embed, self.hidden_size)


================================================
FILE: hanlp/layers/embeddings/char_rnn_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-20 17:02
import tensorflow as tf

from hanlp.common.vocab_tf import VocabTF
from hanlp.utils.tf_util import hanlp_register


@hanlp_register
class CharRNNEmbeddingTF(tf.keras.layers.Layer):
    """Keras layer that embeds each word by running a bidirectional LSTM over its characters.

    The layer receives string tensors of words, splits every word into unicode characters, embeds the
    characters and uses the concatenated final hidden states of both LSTM directions as the word vector.
    """

    def __init__(self, word_vocab: VocabTF, char_vocab: VocabTF,
                 char_embedding=100,
                 char_rnn_units=25,
                 dropout=0.5,
                 trainable=True, name=None, dtype=None, dynamic=False,
                 **kwargs):
        """Create the character embedding, dropout and BiLSTM sub-layers.

        Args:
            word_vocab: Word vocabulary; only its ``pad_token`` is read by this layer.
            char_vocab: Character vocabulary providing ``pad_token``, ``lookup`` and its size.
            char_embedding: Output dimension of the character embedding.
            char_rnn_units: Number of units of each LSTM direction.
            dropout: Dropout rate applied on character embeddings.
            trainable: Passed to the Keras ``Layer`` constructor.
            name: Passed to the Keras ``Layer`` constructor.
            dtype: Passed to the Keras ``Layer`` constructor.
            dynamic: Passed to the Keras ``Layer`` constructor.
            **kwargs: Passed to the Keras ``Layer`` constructor.
        """
        super().__init__(trainable, name, dtype, dynamic, **kwargs)
        self.char_embedding = char_embedding
        self.char_rnn_units = char_rnn_units
        self.char_vocab = char_vocab
        self.word_vocab = word_vocab
        # mask_zero=True makes the embedding attach a mask that is read back in ``call``.
        self.embedding = tf.keras.layers.Embedding(input_dim=len(self.char_vocab), output_dim=char_embedding,
                                                   trainable=True, mask_zero=True)
        self.dropout = tf.keras.layers.Dropout(dropout)
        # return_state=True so that the final hidden/cell states of both directions are returned.
        self.rnn = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=char_rnn_units,
                                                                      return_state=True), name='bilstm')

    def call(self, inputs: tf.Tensor, **kwargs):
        """Embed words by their characters.

        Args:
            inputs: String tensor of words padded with ``word_vocab.pad_token``
                (presumably of shape ``(batch, words)`` -- TODO confirm against callers).
            **kwargs: Not used.

        Returns:
            A tensor whose first two dimensions follow the character embedding tensor and whose last
            dimension holds the concatenated forward/backward final hidden states. The word level padding
            mask is attached to it as ``_keras_mask``.
        """
        # Word level mask: True for every non-padding word.
        mask = tf.not_equal(inputs, self.word_vocab.pad_token)
        # Drop padding words, then split each remaining word into unicode characters.
        inputs = tf.ragged.boolean_mask(inputs, mask)
        chars = tf.strings.unicode_split(inputs, input_encoding='UTF-8')
        # Back to a dense tensor, padded with the character pad token, then map characters to ids.
        chars = chars.to_tensor(default_value=self.char_vocab.pad_token)
        chars = self.char_vocab.lookup(chars)
        embed = self.embedding(chars)
        # Character level mask computed by the embedding layer (mask_zero=True).
        char_mask = embed._keras_mask
        embed = self.dropout(embed)
        embed_shape = tf.shape(embed)
        # Merge the first two dimensions so that every word becomes one sequence for the RNN.
        embed = tf.reshape(embed, [-1, embed_shape[2], embed_shape[3]])
        char_mask = tf.reshape(char_mask, [-1, embed_shape[2]])
        # Rows whose mask is entirely False (no character at all).
        all_zeros = tf.reduce_sum(tf.cast(char_mask, tf.int32), axis=1) == 0
        char_mask_shape = tf.shape(char_mask)
        hole = tf.zeros(shape=(char_mask_shape[0], char_mask_shape[1] - 1), dtype=tf.bool)
        all_zeros = tf.expand_dims(all_zeros, -1)
        # A mask that is True only at the first position of fully masked rows ...
        non_all_zeros = tf.concat([all_zeros, hole], axis=1)
        # ... OR-ed into the char mask so that every row keeps at least one unmasked step
        # (presumably because the masked LSTM cannot handle fully masked sequences -- TODO confirm).
        char_mask = tf.logical_or(char_mask, non_all_zeros)
        output, h_fw, c_fw, h_bw, c_bw = self.rnn(embed, mask=char_mask)
        # The word representation is the concatenation of both directions' final hidden states.
        hidden = tf.concat([h_fw, h_bw], axis=-1)
        # hidden = output
        # Restore the leading two dimensions that were merged above.
        hidden = tf.reshape(hidden, [embed_shape[0], embed_shape[1], -1])
        hidden._keras_mask = mask
        return hidden

    def get_config(self):
        """Return the Keras config of this layer.

        Only ``char_embedding``, ``char_rnn_units`` and ``dropout`` are added to the base config; the two
        vocabularies are not included.
        """
        config = {
            'char_embedding': self.char_embedding,
            'char_rnn_units': self.char_rnn_units,
            'dropout': self.dropout.rate,
        }
        base_config = super(CharRNNEmbeddingTF, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


================================================
FILE: hanlp/layers/embeddings/concat_embedding.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-20 17:08
import tensorflow as tf

from hanlp.utils.tf_util import hanlp_register, copy_mask


@hanlp_register
class ConcatEmbedding(tf.keras.layers.Layer):
    """Keras layer which feeds the same input to several embedding layers and concatenates their outputs."""

    def __init__(self, *embeddings, trainable=True, name=None, dtype=None, dynamic=False, **kwargs):
        """Collect the sub-embeddings and derive the layer flags from them.

        Args:
            *embeddings: Keras layers, or dicts which are deserialized into Keras layers.
            trainable: Forced to ``True`` if any sub-embedding is trainable.
            name: Passed to the Keras ``Layer`` constructor.
            dtype: Passed to the Keras ``Layer`` constructor.
            dynamic: Forced to ``True`` if any sub-embedding is dynamic.
            **kwargs: Passed to the Keras ``Layer`` constructor.
        """
        self.embeddings = []
        for layer in embeddings:
            if isinstance(layer, dict):
                layer = tf.keras.utils.deserialize_keras_object(layer)
            self.embeddings.append(layer)
            # Any sub-layer requiring a capability turns it on for the whole bundle.
            if layer.trainable:
                trainable = True
            if layer.dynamic:
                dynamic = True
            if layer.supports_masking:
                self.supports_masking = True

        super().__init__(trainable, name, dtype, dynamic, **kwargs)

    def build(self, input_shape):
        """Build every sub-embedding with ``input_shape``, then this layer."""
        for layer in self.embeddings:
            layer.build(input_shape)
        super().build(input_shape)

    def compute_mask(self, inputs, mask=None):
        """Return the first non-``None`` mask computed by a sub-embedding."""
        for layer in self.embeddings:
            mask = layer.compute_mask(inputs, mask)
            if mask is None:
                continue
            return mask
        return mask

    def call(self, inputs, **kwargs):
        """Concatenate the outputs of all sub-embeddings along the last axis.

        The mask of the first output for which ``copy_mask`` yields a mask is copied onto the result.
        """
        parts = []
        for layer in self.embeddings:
            parts.append(layer.call(inputs))
        feature = tf.concat(parts, axis=-1)

        for part in parts:
            if copy_mask(part, feature) is not None:
                break
        return feature

    def get_config(self):
        """Return the base Keras config extended with the configs of the sub-embeddings."""
        merged = dict(super(ConcatEmbedding, self).get_config())
        merged.update({'embeddings': [layer.get_config() for layer in self.embeddings]})
        return merged

    def compute_output_shape(self, input_shape):
        """Return ``input_shape`` extended by the summed last dimensions of all sub-embeddings."""
        dim = sum(layer.compute_output_shape(input_shape)[-1] for layer in self.embeddings)
        return input_shape + dim


================================================
FILE: hanlp/layers/embeddings/contextual_string_embedding.py
================================================
# Most of the code in this file is adapted from flair, which is licensed under:
#
# The MIT License (MIT)
#
# Flair is licensed under the following MIT License (MIT) Copyright © 2018 Zalando SE, https://tech.zalando.com
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import os
from typing import List, Dict, Callable

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from hanlp_common.configurable import Configurable
from hanlp.common.transform import TransformList, FieldToIndex
from hanlp.common.vocab import Vocab
from hanlp.layers.embeddings.embedding import Embedding, EmbeddingDim
from hanlp.utils.io_util import get_resource
from hanlp.utils.torch_util import pad_lists, batched_index_select
from tests import cdroot


class RNNLanguageModel(nn.Module):
    """Container module with an encoder (embedding) and a recurrent module (LSTM)."""

    def __init__(self,
                 n_tokens,
                 is_forward_lm: bool,
                 hidden_size: int,
                 embedding_size: int = 100):
        """Create the embedding and the LSTM.

        Args:
            n_tokens: Size of the vocabulary, i.e. number of rows of the embedding table.
            is_forward_lm: Direction flag; it is only stored and serialized by this class.
            hidden_size: Hidden size of the LSTM.
            embedding_size: Dimension of the token embeddings.
        """
        super().__init__()

        self.is_forward_lm: bool = is_forward_lm
        self.n_tokens = n_tokens
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size

        self.encoder = nn.Embedding(n_tokens, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, batch_first=True)

    def forward(self, ids: torch.LongTensor, lens: torch.LongTensor):
        """Run the LSTM over padded id sequences.

        Args:
            ids: Token ids of shape ``(batch, seq_len)``.
            lens: Length of each sequence, as a tensor or a list.

        Returns:
            Hidden states of shape ``(batch, max(lens), hidden_size)``.
        """
        emb = self.encoder(ids)
        # pack_padded_sequence requires lengths on the CPU, while callers may derive them from GPU tensors.
        if isinstance(lens, torch.Tensor):
            lens = lens.cpu()
        # batch_first=True, enforce_sorted=False
        x = pack_padded_sequence(emb, lens, True, False)
        x, _ = self.rnn(x)
        x, _ = pad_packed_sequence(x, True)
        return x

    @classmethod
    def load_language_model(cls, model_file):
        """Load a language model saved by :meth:`save`.

        Args:
            model_file: Identifier or path resolved through ``get_resource``.

        Returns:
            An instance of ``cls`` with the stored weights loaded.
        """
        model_file = get_resource(model_file)
        # Load onto the CPU so that checkpoints saved on a GPU also work on CPU-only machines.
        # load_state_dict copies the values into the freshly created (CPU) parameters anyway.
        state = torch.load(model_file, map_location='cpu')
        # Use cls instead of a hard-coded class so that subclasses are instantiated properly.
        model = cls(state['n_tokens'],
                    state['is_forward_lm'],
                    state['hidden_size'],
                    state['embedding_size'])
        # strict=False tolerates keys which do not match this module.
        model.load_state_dict(state['state_dict'], strict=False)
        return model

    def save(self, file):
        """Serialize the weights together with the constructor arguments.

        Args:
            file: A path or file-like object accepted by ``torch.save``.
        """
        model_state = {
            'state_dict': self.state_dict(),
            'n_tokens': self.n_tokens,
            'is_forward_lm': self.is_forward_lm,
            'hidden_size': self.hidden_size,
            'embedding_size': self.embedding_size,
        }
        torch.save(model_state, file, pickle_protocol=4)


class ContextualStringEmbeddingModule(nn.Module, EmbeddingDim):
    """Word embeddings taken from the hidden states of a forward and a backward character language model."""

    def __init__(self, field: str, path: str, trainable=False) -> None:
        """Load both language models.

        Args:
            field: Name of the field; batch keys are looked up as ``<field>_f_char_id`` etc.
            path: Identifier or path of a folder containing ``forward.pt`` and ``backward.pt``.
            trainable: ``False`` to freeze all parameters.
        """
        super().__init__()
        self.field = field
        path = get_resource(path)
        f = os.path.join(path, 'forward.pt')
        b = os.path.join(path, 'backward.pt')
        self.f: RNNLanguageModel = RNNLanguageModel.load_language_model(f)
        self.b: RNNLanguageModel = RNNLanguageModel.load_language_model(b)
        if not trainable:
            for p in self.parameters():
                p.requires_grad_(False)

    def __call__(self, batch: dict, **kwargs):
        """Pick the tensors of this field out of ``batch`` and feed them to :meth:`forward`.

        Args:
            batch: A dict containing ``<field>_f_char_id``, ``<field>_f_offset``, ``<field>_b_char_id``
                and ``<field>_b_offset``.
            **kwargs: Forwarded to :meth:`forward`.

        Returns:
            The output of :meth:`forward`.
        """
        args = ['f_char_id', 'f_offset', 'b_char_id', 'b_offset']
        keys = [f'{self.field}_{key}' for key in args]
        args = [batch[key] for key in keys]
        return super().__call__(*args, **kwargs)

    @property
    def embedding_dim(self):
        """Sum of the hidden sizes of the forward and the backward LSTM."""
        return self.f.rnn.hidden_size + self.b.rnn.hidden_size

    def run_lm(self, lm, ids: torch.Tensor, offsets: torch.LongTensor):
        """Run one language model and select the hidden states at ``offsets``.

        Args:
            lm: The language model to run.
            ids: Character ids.
            offsets: Per token character offsets; the largest offset of each row plus one is used as the
                length of that row.

        Returns:
            The hidden states gathered by ``batched_index_select`` at ``offsets``.
        """
        lens = offsets.max(-1)[0] + 1
        rnn_output = lm(ids, lens)
        return batched_index_select(rnn_output, offsets)

    def forward(self,
                f_chars_id: torch.Tensor,
                f_offset: torch.LongTensor,
                b_chars_id: torch.Tensor,
                b_offset: torch.LongTensor, **kwargs):
        """Concatenate the token representations of the forward and the backward language model.

        Args:
            f_chars_id: Character ids fed to the forward model.
            f_offset: Token offsets into the forward character sequence.
            b_chars_id: Character ids fed to the backward model.
            b_offset: Token offsets into the backward character sequence.
            **kwargs: Not used.

        Returns:
            Forward and backward representations concatenated along the last dimension.
        """
        f = self.run_lm(self.f, f_chars_id, f_offset)
        b = self.run_lm(self.b, b_chars_id, b_offset)
        return torch.cat([f, b], dim=-1)

    def embed(self, sents: List[List[str]], vocab: Dict[str, int]):
        """Embed tokenized sentences directly, bypassing the data pipeline.

        Args:
            sents: Sentences, each being a list of tokens.
            vocab: Mapping from character to id.

        Returns:
            The output of :meth:`forward` on the padded batch.
        """
        f_chars, f_offsets = [], []
        b_chars, b_offsets = [], []

        transform = ContextualStringEmbeddingTransform('token')
        for tokens in sents:
            sample = transform({'token': tokens})
            # The transform stores its outputs under singular keys:
            # ``token_f_char``, ``token_b_char``, ``token_f_offset`` and ``token_b_offset``.
            f_chars.append(sample['token_f_char'])
            b_chars.append(sample['token_b_char'])
            f_offsets.append(sample['token_f_offset'])
            b_offsets.append(sample['token_b_offset'])

        f_ids = pad_lists([[vocab[c] for c in cs] for cs in f_chars])
        f_offsets = pad_lists(f_offsets)

        b_ids = pad_lists([[vocab[c] for c in cs] for cs in b_chars])
        b_offsets = pad_lists(b_offsets)
        return self.forward(f_ids, f_offsets, b_ids, b_offsets)


class ContextualStringEmbeddingTransform(Configurable):
    """Turn a list of tokens into the character sequences and token offsets used by the language models."""

    def __init__(self, src: str) -> None:
        """
        Args:
            src: Name of the field holding the tokens; outputs are stored under ``<src>_...`` keys.
        """
        self.src = src

    def __call__(self, sample: dict):
        """Add forward/backward character sequences and per token offsets to ``sample``.

        The forward sequence is ``'\\n'`` + the space-joined tokens + ``' '``; the backward sequence wraps
        the reversed text the same way. Each offset is the index of the character immediately following the
        token (a separating space or the end marker) in the respective sequence.

        Args:
            sample: A dict holding the tokens under ``self.src``.

        Returns:
            The same dict with ``<src>_f_char``, ``<src>_b_char``, ``<src>_f_offset`` and ``<src>_b_offset``.
        """
        words = sample[self.src]
        text = ' '.join(words)
        start_marker, end_marker = '\n', ' '
        forward_chars = list(start_marker + text + end_marker)
        backward_chars = list(start_marker + text[::-1] + end_marker)

        forward_offsets, backward_offsets = [], []
        # Both cursors account for the leading start marker.
        forward_cursor = 1
        backward_cursor = len(text) + 1
        for word in words:
            forward_cursor += len(word)
            forward_offsets.append(forward_cursor)
            backward_offsets.append(backward_cursor)
            # This language model is tokenized: step over the space between tokens, and over the token itself
            # when walking backward.
            forward_cursor += 1
            backward_cursor -= len(word) + 1

        sample[f'{self.src}_f_char'] = forward_chars
        sample[f'{self.src}_b_char'] = backward_chars
        sample[f'{self.src}_f_offset'] = forward_offsets
        sample[f'{self.src}_b_offset'] = backward_offsets
        return sample


class ContextualStringEmbedding(Embedding):
    def __init__(self, field, path, trainable=False) -> None:
        """Builder of the contextual string embedding.

        Args:
            field: Name of the token field to work on.
            path: Identifier or path of the folder holding the language models and ``vocab.json``.
            trainable: Forwarded to ``ContextualStringEmbeddingModule``.
        """
        super().__init__()
        self.trainable = trainable
        self.path = path
        self.field = field

    def transform(self, **kwargs) -> Callable:
        """Build the transform producing characters and offsets, followed by character indexing.

        Args:
            **kwargs: Not used.

        Returns:
            A ``TransformList`` of the string transform and two ``FieldToIndex`` sharing one vocab.
        """
        char_vocab = Vocab()
        vocab_file = os.path.join(get_resource(self.path), 'vocab.json')
        char_vocab.load(vocab_file)
        to_chars = ContextualStringEmbeddingTransform(self.field)
        index_forward = FieldToIndex(f'{self.field}_f_char', char_vocab)
        index_backward = FieldToIndex(f'{self.field}_b_char', char_vocab)
        return TransformList(to_chars, index_forward, index_backward)

    def module(self, **kwargs) -> nn.Module:
        """Build the ``ContextualStringEmbeddingModule`` for this field.

        Args:
            **kwargs: Not used.
        """
        return ContextualStringEmbeddingModule(self.field, self.path, self.trainable)


def main():
    """Debug entry point: build an embedding builder and print its config."""
    # _validate()
    builder = ContextualStringEmbedding('token', 'FASTTEXT_DEBUG_EMBEDDING_EN')
    print(builder.config)


def _validate():
    """Developer-only check comparing this implementation against embeddings dumped to local files.

    Relies on files under ``/home/hhe43/flair`` and prints whether the second sentence's first 2048
    dimensions are close to the stored reference.
    """
    cdroot()
    module = ContextualStringEmbeddingModule('token', 'FLAIR_LM_WMT11_EN')
    raw_vocab = torch.load('/home/hhe43/flair/item2idx.pt')
    # Keys are stored as bytes; decode them to str.
    char_to_id = {key.decode(): idx for key, idx in raw_vocab.items()}
    # vocab = Vocab(token_to_idx=vocab, pad_token='<unk>')
    # vocab.lock()
    # vocab.summary()
    # vocab.save('vocab.json')
    words = 'I love Berlin .'.split()
    actual = module.embed([words, words], char_to_id)
    expected = torch.load('/home/hhe43/flair/gold.pt')
    print(torch.allclose(actual[1, :, :2048], expected, atol=1e-6))
    # print(torch.all(torch.eq(embed[1, :, :], gold)))


# Script entry point: running this module directly invokes ``main()``.
if __name__ == '__main__':
    main()


================================================
FILE: hanlp/layers/embeddings/contextual_string_embedding_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-19 03:24
from typing import List

import tensorflow as tf
import numpy as np
from hanlp.components.rnn_language_model_tf import RNNLanguageModel
from hanlp_common.constant import PAD
from hanlp.utils.io_util import get_resource
from hanlp.utils.tf_util import copy_mask, hanlp_register, str_tensor_2d_to_list
from hanlp_common.util import infer_space_after


@hanlp_register
class ContextualStringEmbeddingTF(tf.keras.layers.Layer):
    """Keras layer embedding words with the hidden states of forward and/or backward RNN language models."""

    def __init__(self, forward_model_path=None, backward_model_path=None, max_word_len=10,
                 trainable=False, name=None, dtype=None,
                 dynamic=True, **kwargs):
        """Load the language models.

        Args:
            forward_model_path: Identifier or path of the forward language model, or ``None``.
            backward_model_path: Identifier or path of the backward language model, or ``None``.
                At least one of the two paths is required.
            max_word_len: Words are truncated to this many characters.
            trainable: ``True`` to mark the loaded Keras models as trainable.
            name: Passed to the Keras ``Layer`` constructor.
            dtype: Passed to the Keras ``Layer`` constructor.
            dynamic: Must be truthy, since this layer only works in eager mode.
            **kwargs: Passed to the Keras ``Layer`` constructor.
        """
        assert dynamic, 'ContextualStringEmbedding works only in eager mode'
        super().__init__(trainable, name, dtype, dynamic, **kwargs)
        assert any([forward_model_path, backward_model_path]), 'At least one model is required'
        self.forward_model_path = forward_model_path
        self.backward_model_path = backward_model_path
        self.forward_model = self._load_lm(forward_model_path) if forward_model_path else None
        self.backward_model = self._load_lm(backward_model_path) if backward_model_path else None
        if trainable:
            # Only one direction may be configured, so guard against the missing one instead of
            # dereferencing ``None``.
            self._fw = self.forward_model.model if self.forward_model else None
            self._bw = self.backward_model.model if self.backward_model else None
            for m in self._fw, self._bw:
                if m is not None:
                    m.trainable = True
        self.supports_masking = True
        self.max_word_len = max_word_len

    def call(self, inputs, **kwargs):
        """Embed a 2D string tensor of words and copy the mask of ``inputs`` onto the result."""
        str_inputs = str_tensor_2d_to_list(inputs)
        outputs = self.embed(str_inputs)
        copy_mask(inputs, outputs)
        return outputs

    def _load_lm(self, filepath):
        """Load a language model and cut its Keras model right after the first LSTM layer.

        Args:
            filepath: Identifier or path resolved through ``get_resource``.

        Returns:
            The language model component, or ``None`` when its model contains no LSTM layer.
        """
        filepath = get_resource(filepath)
        lm = RNNLanguageModel()
        lm.load(filepath)
        model: tf.keras.Sequential = lm.model
        for idx, layer in enumerate(model.layers):
            if isinstance(layer, tf.keras.layers.LSTM):
                lm.model = tf.keras.Sequential(model.layers[:idx + 1])  # discard dense layer
                return lm

    def embed(self, texts: List[List[str]]):
        """Embed sentences (lists of words) with contextualized string embeddings.

        Args:
            texts: Sentences, each being a list of words (not characters).

        Returns:
            The output of the only configured language model, or the forward and backward outputs
            concatenated along the last axis when both are configured.
        """
        fw = None
        if self.forward_model:
            fw = self._run_rnn(texts, model=self.forward_model)
        bw = None
        if self.backward_model:
            bw = self._run_rnn(texts, model=self.backward_model)
        if not all(x is not None for x in [fw, bw]):
            return fw if fw is not None else bw
        else:
            return tf.concat([fw, bw], axis=-1)

    def _run_rnn(self, texts, model):
        """Run one language model over the sentences and pick one hidden state per word.

        Args:
            texts: Sentences, each being a list of words.
            model: The language model component to run.

        Returns:
            A stacked tensor with one row of word vectors per sentence; shorter sentences are padded
            with zero vectors up to the longest sentence.
        """
        embeddings = []
        inputs = []
        offsets = []
        tokenizer = model.transform.tokenize_func()
        backward = not model.config['forward']
        for sent in texts:
            raw, off = self._get_raw_string(sent, tokenizer)
            inputs.append(raw)
            offsets.append(off)
        outputs = model.model_from_config.predict(model.transform.inputs_to_dataset(inputs))
        if backward:
            # Flip the time axis of the backward model's output so that the offsets below apply to it too.
            outputs = tf.reverse(outputs, axis=[1])
        maxlen = len(max(texts, key=len))
        for hidden, off, sent in zip(outputs, offsets, texts):
            embed = []
            for (start, end), word in zip(off, sent):
                # The state at the last character of the word (including its trailing space, if any).
                embed.append(hidden[end - 1, :])
            if len(embed) < maxlen:
                embed += [np.zeros_like(embed[-1])] * (maxlen - len(embed))
            embeddings.append(np.stack(embed))
        return tf.stack(embeddings)

    def _get_raw_string(self, sent: List[str], tokenizer):
        """Flatten a sentence into characters and record the span of each word.

        Args:
            sent: A list of words.
            tokenizer: A function splitting a word into a list of characters.

        Returns:
            A tuple of the flat character list and a list of ``(start, end)`` spans, one per word. A span
            covers the (truncated) characters of the word plus the space after it, if there is one.
        """
        raw_string = []
        offsets = []
        whitespace_after = infer_space_after(sent)
        start = 0
        for word, space in zip(sent, whitespace_after):
            chars = tokenizer(word)
            chars = chars[:self.max_word_len]
            if space:
                chars += [' ']
            end = start + len(chars)
            offsets.append((start, end))
            start = end
            raw_string += chars
        return raw_string, offsets

    def get_config(self):
        """Return the base Keras config extended with both model paths and ``max_word_len``."""
        config = {
            'forward_model_path': self.forward_model_path,
            'backward_model_path': self.backward_model_path,
            'max_word_len': self.max_word_len,
        }
        base_config = super(ContextualStringEmbeddingTF, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    @property
    def output_dim(self):
        """Sum of ``rnn_units`` over the configured language models."""
        dim = 0
        for model in self.forward_model, self.backward_model:
            if model:
                dim += model.config['rnn_units']
        return dim

    def compute_output_shape(self, input_shape):
        """Return ``input_shape`` extended by :attr:`output_dim`."""
        return input_shape + self.output_dim

    def compute_mask(self, inputs, mask=None):
        """Mask out positions equal to ``PAD``; the incoming ``mask`` is ignored."""

        return tf.not_equal(inputs, PAD)


================================================
FILE: hanlp/layers/embeddings/contextual_word_embedding.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-05 13:50
from typing import Optional, Union, List, Any, Dict, Tuple

import torch
from torch import nn

from hanlp_common.configurable import AutoConfigurable
from hanlp.layers.embeddings.embedding import Embedding
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.layers.transformers.encoder import TransformerEncoder
from hanlp.layers.transformers.pt_imports import PreTrainedTokenizer, AutoConfig_, AutoTokenizer_
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer


class ContextualWordEmbeddingModule(TransformerEncoder):
    def __init__(self,
                 field: str,
                 transformer: str,
                 transformer_tokenizer: PreTrainedTokenizer,
                 average_subwords=False,
                 scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
                 word_dropout=None,
                 max_sequence_length=None,
                 ret_raw_hidden_states=False,
                 transformer_args: Dict[str, Any] = None,
                 trainable=True,
                 training=True) -> None:
        """A contextualized word embedding module reading its inputs from a batch dict.

        Args:
            field: The field to work on. Usually some token fields.
            transformer:  An identifier of a ``PreTrainedModel``.
            transformer_tokenizer: The tokenizer handed to the underlying ``TransformerEncoder``.
            average_subwords: ``True`` to average subword representations.
            scalar_mix: Layer attention.
            word_dropout: Dropout rate of randomly replacing a subword with MASK.
            max_sequence_length: The maximum sequence length. Sequence longer than this will be handled by sliding
                window.
            ret_raw_hidden_states: ``True`` to return hidden states of each layer.
            transformer_args: Extra arguments passed to the transformer.
            trainable: ``False`` to use static embeddings.
            training: ``False`` to skip loading weights from pre-trained transformers.
        """
        super().__init__(transformer, transformer_tokenizer, average_subwords, scalar_mix, word_dropout,
                         max_sequence_length, ret_raw_hidden_states, transformer_args, trainable,
                         training)
        self.field = field

    # noinspection PyMethodOverriding
    # noinspection PyTypeChecker
    def forward(self, batch: dict, mask=None, **kwargs):
        """Encode the subword ids stored in ``batch`` for this field.

        Args:
            batch: A dict holding ``<field>_input_ids`` and optionally ``<field>_token_span``.
            mask: Accepted for interface compatibility; currently not applied.
            **kwargs: Forwarded to the parent encoder.

        Returns:
            Whatever the parent encoder returns: a tensor or a list of tensors.
        """
        subword_ids = batch[f'{self.field}_input_ids']
        # The span is optional, hence ``get``.
        spans = batch.get(f'{self.field}_token_span', None)
        # NOTE: inputs are expected on the same device as this module; no device transfer happens here.
        return super().forward(subword_ids, token_span=spans, **kwargs)

    def get_output_dim(self):
        """Hidden size reported by the config of the wrapped transformer."""
        transformer_config = self.transformer.config
        return transformer_config.hidden_size

    def get_device(self):
        """Device of the first parameter of this module."""
        first_param = next(self.parameters())
        return first_param.device


class ContextualWordEmbedding(Embedding, AutoConfigurable):
    def __init__(self, field: str,
                 transformer: str,
                 average_subwords=False,
                 scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
                 word_dropout: Optional[Union[float, Tuple[float, str]]] = None,
                 max_sequence_length=None,
                 truncate_long_sequences=False,
                 cls_is_bos=False,
                 sep_is_eos=False,
                 ret_token_span=True,
                 ret_subtokens=False,
                 ret_subtokens_group=False,
                 ret_prefix_mask=False,
                 ret_raw_hidden_states=False,
                 transformer_args: Dict[str, Any] = None,
                 use_fast=True,
                 do_basic_tokenize=True,
                 trainable=True) -> None:
        """A contextual word embedding builder which builds a
        :class:`~hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbeddingModule` and a
        :class:`~hanlp.transform.transformer_tokenizer.TransformerSequenceTokenizer`.

        The tokenizer and the tokenizer transform are created eagerly in this constructor.

        Args:
            field: The field to work on. Usually some token fields.
            transformer:  An identifier of a ``PreTrainedModel``.
            average_subwords: ``True`` to average subword representations.
            scalar_mix: Layer attention.
            word_dropout: Dropout rate of randomly replacing a subword with MASK.
            max_sequence_length: The maximum sequence length. Sequence longer than this will be handled by sliding
                window.
            truncate_long_sequences: Forwarded to ``TransformerSequenceTokenizer``; presumably ``True`` truncates
                sequences exceeding ``max_sequence_length`` instead of using a sliding window (confirm against
                the tokenizer's documentation).
            cls_is_bos: ``True`` means the first token of input is treated as [CLS] no matter what its surface form is.
                        ``False`` (default) means the first token is not [CLS], it will have its own embedding other than
                        the embedding of [CLS].
            sep_is_eos: ``True`` means the last token of input is [SEP].
                        ``False`` means it's not but [SEP] will be appended,
                        ``None`` means it depends on `input[-1] == [EOS]`.
            ret_token_span: ``True`` to return span of each token measured by subtoken offsets.
            ret_subtokens: ``True`` to return list of subtokens belonging to each token.
            ret_subtokens_group: ``True`` to return list of offsets of subtokens belonging to each token.
            ret_prefix_mask: ``True`` to generate a mask where each non-zero element corresponds to a prefix of a token.
            ret_raw_hidden_states: ``True`` to return hidden states of each layer.
            transformer_args: Extra arguments passed to the transformer.
            use_fast: Whether or not to try to load the fast version of the tokenizer.
            do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            trainable: ``False`` to use static embeddings.
        """
        super().__init__()
        self.truncate_long_sequences = truncate_long_sequences
        self.transformer_args = transformer_args
        self.trainable = trainable
        self.ret_subtokens_group = ret_subtokens_group
        self.ret_subtokens = ret_subtokens
        self.ret_raw_hidden_states = ret_raw_hidden_states
        self.sep_is_eos = sep_is_eos
        self.cls_is_bos = cls_is_bos
        self.max_sequence_length = max_sequence_length
        self.word_dropout = word_dropout
        self.scalar_mix = scalar_mix
        self.average_subwords = average_subwords
        self.transformer = transformer
        self.field = field
        # NOTE(review): ret_prefix_mask, ret_token_span, use_fast and do_basic_tokenize are consumed below but
        # not stored as attributes on this builder.
        self._transformer_tokenizer = AutoTokenizer_.from_pretrained(self.transformer,
                                                                     use_fast=use_fast,
                                                                     do_basic_tokenize=do_basic_tokenize)
        self._tokenizer_transform = TransformerSequenceTokenizer(self._transformer_tokenizer,
                                                                 field,
                                                                 truncate_long_sequences=truncate_long_sequences,
                                                                 ret_prefix_mask=ret_prefix_mask,
                                                                 ret_token_span=ret_token_span,
                                                                 cls_is_bos=cls_is_bos,
                                                                 sep_is_eos=sep_is_eos,
                                                                 ret_subtokens=ret_subtokens,
                                                                 ret_subtokens_group=ret_subtokens_group,
                                                                 max_seq_length=self.max_sequence_length
                                                                 )

    def transform(self, **kwargs) -> TransformerSequenceTokenizer:
        """Return the tokenizer transform created in the constructor; ``kwargs`` are ignored."""
        return self._tokenizer_transform

    def module(self, training=True, **kwargs) -> Optional[nn.Module]:
        """Build the ``ContextualWordEmbeddingModule`` configured by this builder.

        Args:
            training: Forwarded to the module as its ``training`` argument.
            **kwargs: Not used.

        Returns:
            A new ``ContextualWordEmbeddingModule`` sharing this builder's tokenizer.
        """
        return ContextualWordEmbeddingModule(self.field,
                                             self.transformer,
                                             self._transformer_tokenizer,
                                             self.average_subwords,
                                             self.scalar_mix,
                                             self.word_dropout,
                                             self.max_sequence_length,
                                             self.ret_raw_hidden_states,
                                             self.transformer_args,
                                             self.trainable,
                                             training=training)

    def get_output_dim(self):
        """Hidden size read from the pre-trained config of ``self.transformer``."""
        config = AutoConfig_.from_pretrained(self.transformer)
        return config.hidden_size

    def get_tokenizer(self):
        """Return the tokenizer loaded in the constructor."""
        return self._transformer_tokenizer


def find_transformer(embed: nn.Module):
    """Search ``embed`` for a ``ContextualWordEmbeddingModule``.

    Args:
        embed: A module; ``nn.ModuleList`` instances are searched recursively, depth first.

    Returns:
        ``embed`` itself if it is a ``ContextualWordEmbeddingModule``, otherwise the first truthy match found
        among its children, or ``None`` if there is none.
    """
    if isinstance(embed, ContextualWordEmbeddingModule):
        return embed
    if not isinstance(embed, nn.ModuleList):
        return None
    for sub_module in embed:
        hit = find_transformer(sub_module)
        if hit:
            return hit
    return None


================================================
FILE: hanlp/layers/embeddings/embedding.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-02 13:04
from abc import ABC, abstractmethod
from typing import Callable, List, Optional, Iterable

import torch
from torch import nn
from torch.nn import Module

from hanlp_common.configurable import AutoConfigurable
from hanlp.common.transform import TransformList
from hanlp.layers.dropout import IndependentDropout


class EmbeddingDim(ABC):
    """Interface of objects exposing the dimension of the embeddings they produce."""

    @property
    @abstractmethod
    def embedding_dim(self) -> int:
        """Dimension of the embeddings; implementations must override this property."""
        return -1

    def get_output_dim(self) -> int:
        """Method flavoured alias of :attr:`embedding_dim`."""
        size = self.embedding_dim
        return size


class Embedding(AutoConfigurable, ABC):
    """Base class for embedding builders.

    A builder optionally provides a transform (data preparation) and a module (the actual layer). Both
    default to ``None`` here.
    """

    def __init__(self) -> None:
        super().__init__()

    def transform(self, **kwargs) -> Optional[Callable]:
        """Build a transform function for this embedding.

        Args:
            **kwargs: Containing vocabs, training etc. Not finalized for now.

        Returns:
            A transform function, or ``None`` (the default) if no transform is needed.
        """

    def module(self, **kwargs) -> Optional[nn.Module]:
        """Build a module for this embedding.

        Args:
            **kwargs: Containing vocabs, training etc. Not finalized for now.

        Returns:
            A module, or ``None`` (the default) if this builder has no module.
        """


class ConcatModuleList(nn.ModuleList, EmbeddingDim):

    def __init__(self, *modules: Optional[Iterable[Module]], dropout=None) -> None:
        """A ``nn.ModuleList`` to bundle several embeddings modules.

        Args:
            *modules: Positional arguments forwarded to ``nn.ModuleList``, i.e. an iterable of embedding layers.
            dropout: Dropout applied on the concatenated embedding.
        """
        super().__init__(*modules)
        if dropout:
            dropout = IndependentDropout(p=dropout)
        # Assigning a module registers it as a child of this list, so iterating ``self`` also yields the
        # dropout layer. Use :attr:`embeddings` whenever only the embedding layers are wanted.
        self.dropout = dropout

    @property
    def embedding_dim(self) -> int:
        """Sum of ``embedding_dim`` over the embedding layers (the dropout layer is excluded)."""
        return sum(embed.embedding_dim for embed in self.embeddings)

    def get_output_dim(self) -> int:
        """Sum of ``get_output_dim()`` over the embedding layers (the dropout layer is excluded)."""
        return sum(embed.get_output_dim() for embed in self.embeddings)

    # noinspection PyMethodOverriding
    def forward(self, batch: dict, **kwargs):
        """Run every embedding layer on ``batch`` and concatenate the results along the last dimension.

        Args:
            batch: The batch dict handed to every embedding layer.
            **kwargs: Forwarded to every embedding layer.

        Returns:
            The concatenated embeddings, with dropout applied to the individual parts first if configured.
        """
        embeds = [embed(batch, **kwargs) for embed in self.embeddings]
        if self.dropout:
            embeds = self.dropout(*embeds)
        return torch.cat(embeds, -1)

    @property
    def embeddings(self):
        """The embedding layers of this list, i.e. all children except the dropout layer."""
        dropout = self.dropout
        return [module for module in self if module is not dropout]


class EmbeddingList(Embedding):
    def __init__(self, *embeddings_, embeddings: dict = None, dropout=None) -> None:
        """An embedding builder to bundle several embedding builders.

        Args:
            *embeddings_: A list of embedding builders.
            embeddings: Used for deserialization: an iterable whose items are embedding builders or config
                dicts which get restored through ``AutoConfigurable.from_config``.
            dropout: Dropout applied on the concatenated embedding.
        """
        # noinspection PyTypeChecker
        self.dropout = dropout
        builders: List[Embedding] = list(embeddings_)
        self._embeddings = builders
        if embeddings:
            builders.extend(
                AutoConfigurable.from_config(item) if isinstance(item, dict) else item
                for item in embeddings
            )
        # Public attribute holding the configs of all bundled builders.
        self.embeddings = [builder.config for builder in builders]

    def transform(self, **kwargs):
        """Bundle the truthy transforms of all builders into a ``TransformList``."""
        built = [builder.transform(**kwargs) for builder in self._embeddings]
        return TransformList(*filter(None, built))

    def module(self, **kwargs):
        """Bundle the truthy modules of all builders into a ``ConcatModuleList``."""
        built = [builder.module(**kwargs) for builder in self._embeddings]
        return ConcatModuleList(list(filter(None, built)), dropout=self.dropout)

    def to_list(self):
        """Return the list of bundled embedding builders."""
        return self._embeddings


def find_embedding_by_class(embed: Embedding, cls):
    """Search an embedding builder, depth first, for the first builder of a given class.

    Args:
        embed: The embedding builder to search. ``EmbeddingList`` instances are searched recursively.
        cls: The class (or tuple of classes) to look for, as accepted by ``isinstance``.

    Returns:
        The first match, or ``None`` if there is none.
    """
    if isinstance(embed, cls):
        return embed
    if not isinstance(embed, EmbeddingList):
        return None
    for child in embed.to_list():
        match = find_embedding_by_class(child, cls)
        if match:
            return match
    return None


================================================
FILE: hanlp/layers/embeddings/fast_text.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-27 15:06
import logging
import os
import sys
from typing import Optional, Callable

import fasttext
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence

from hanlp_common.configurable import AutoConfigurable
from torch.utils.data import DataLoader

from hanlp.common.dataset import PadSequenceDataLoader, TransformableDataset
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import EmbeddingNamedTransform
from hanlp.common.vocab import Vocab
from hanlp.layers.embeddings.embedding import Embedding
from hanlp.utils.io_util import get_resource, stdout_redirected
from hanlp.utils.log_util import flash


class FastTextTransform(EmbeddingNamedTransform):
    def __init__(self, filepath: str, src, dst=None, **kwargs) -> None:
        """A transform which looks up fastText vectors for the tokens stored in a field of a sample.

        Args:
            filepath: Path to the fastText model, resolved through ``get_resource``.
            src: The field to read tokens from.
            dst: The field to write vectors to. Defaults to ``src`` suffixed with ``_fasttext``.
            **kwargs: Not used.
        """
        target = dst if dst else src + '_fasttext'
        self.filepath = filepath
        flash(f'Loading fasttext model {filepath} [blink][yellow]...[/yellow][/blink]')
        resolved = get_resource(filepath)
        # Redirect stdout while the model is being loaded.
        with stdout_redirected(to=os.devnull, stdout=sys.stderr):
            self._model = fasttext.load_model(resolved)
        flash('')
        # The size of one looked-up vector is used as the output dimension.
        super().__init__(self._model['king'].size, src, target)

    def __call__(self, sample: dict):
        """Embed ``sample[self.src]`` and store the result in ``sample[self.dst]``.

        Args:
            sample: A sample whose ``src`` field is either a single string or an iterable of strings.

        Returns:
            The same sample, with one vector (single string) or a stack of vectors (otherwise) added.
        """
        value = sample[self.src]
        if isinstance(value, str):
            sample[self.dst] = self.embed(value)
        else:
            sample[self.dst] = torch.stack([self.embed(token) for token in value])
        return sample

    def embed(self, word: str):
        """Look up a word in the fastText model and return its vector as a tensor."""
        vector = self._model[word]
        return torch.tensor(vector)


class SelectFromBatchModule(torch.nn.Module):
    """A module which simply picks one field out of a batch."""

    def __init__(self, key) -> None:
        """
        Args:
            key: The key of the field to select.
        """
        super().__init__()
        self.key = key

    def __call__(self, batch: dict, mask=None, **kwargs):
        """Return ``batch[self.key]``.

        Args:
            batch: The batch to select from.
            mask: Not used.
            **kwargs: Not used.

        Returns:
            The value stored under ``self.key``.
        """
        selected = batch[self.key]
        return selected


class FastTextEmbeddingModule(SelectFromBatchModule):

    def __init__(self, key, embedding_dim: int) -> None:
        """An embedding layer for fastText (:cite:`bojanowski2017enriching`).

        Args:
            key: Field name.
            embedding_dim: Size of this embedding layer
        """
        super().__init__(key)
        self.embedding_dim = embedding_dim

    def __call__(self, batch: dict, mask=None, **kwargs):
        """Select the vectors stored in the batch and pad them into one tensor.

        Args:
            batch: The batch holding a sequence of tensors under ``self.key``.
            mask: If given, the padded tensor is moved to the device of this mask.
            **kwargs: Forwarded to the parent class.

        Returns:
            The sequences padded with zeros, batch dimension first.
        """
        sequences = super().__call__(batch, **kwargs)
        padded = pad_sequence(sequences, batch_first=True, padding_value=0)
        if mask is None:
            return padded
        return padded.to(mask.device)

    def __repr__(self):
        return f'{self.__class__.__name__}(key={self.key}, embedding_dim={self.embedding_dim})'

    def get_output_dim(self):
        """Return the size of this embedding layer."""
        return self.embedding_dim


class FastTextEmbedding(Embedding, AutoConfigurable):
    def __init__(self, src: str, filepath: str) -> None:
        """An embedding layer builder for fastText (:cite:`bojanowski2017enriching`).

        The fastText transform (and hence the model) is created right here in the constructor.

        Args:
            src: Field name.
            filepath: Filepath to pretrained fastText embeddings.
        """
        super().__init__()
        self.src = src
        self.filepath = filepath
        self._fasttext = FastTextTransform(self.filepath, self.src)

    def transform(self, **kwargs) -> Optional[Callable]:
        """Return the fastText transform created in the constructor. ``kwargs`` are not used."""
        return self._fasttext

    def module(self, **kwargs) -> Optional[nn.Module]:
        """Create a module which reads the vectors written by the transform. ``kwargs`` are not used."""
        fasttext_transform = self._fasttext
        return FastTextEmbeddingModule(fasttext_transform.dst, fasttext_transform.output_dim)


class FastTextDataset(TransformableDataset):
    """A dataset for the fastText toy component which does not support loading from files."""

    def load_file(self, filepath: str):
        """Loading from a file is not supported.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('Not supported.')


class FastTextEmbeddingComponent(TorchComponent):
    def __init__(self, **kwargs) -> None:
        """Toy example of FastTextEmbedding. It simply returns the embedding of the given input.

        Everything related to training raises ``NotImplementedError``.

        Args:
            **kwargs: Forwarded to ``TorchComponent``.
        """
        super().__init__(**kwargs)

    def build_dataloader(self, data, shuffle=False, device=None, logger: logging.Logger = None,
                         **kwargs) -> DataLoader:
        """Wrap ``data`` as the ``token`` field of a single sample and apply the fastText transform on it."""
        builder: FastTextEmbedding = self.config.embed
        samples = [{'token': data}]
        dataset = FastTextDataset(samples, transform=builder.transform())
        return PadSequenceDataLoader(dataset, device=device)

    def build_optimizer(self, **kwargs):
        raise NotImplementedError('Not supported.')

    def build_criterion(self, **kwargs):
        raise NotImplementedError('Not supported.')

    def build_metric(self, **kwargs):
        raise NotImplementedError('Not supported.')

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, **kwargs):
        raise NotImplementedError('Not supported.')

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        raise NotImplementedError('Not supported.')

    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        raise NotImplementedError('Not supported.')

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Nothing to load."""
        pass

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """Nothing to load."""
        pass

    def build_model(self, training=True, **kwargs) -> torch.nn.Module:
        """Build the module of the configured embedding builder."""
        builder: FastTextEmbedding = self.config.embed
        return builder.module()

    def predict(self, data: str, **kwargs):
        """Embed ``data`` and return the first item of the model output for the first batch."""
        loader = self.build_dataloader(data, device=self.device)
        for batch in loader:  # It's a toy so doesn't really do batching
            output = self.model(batch)
            return output[0]

    @property
    def devices(self):
        """This component always reports the CPU as its only device."""
        return [torch.device('cpu')]


================================================
FILE: hanlp/layers/embeddings/fast_text_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-29 13:14
import os
import sys

import numpy as np
import tensorflow as tf
from tensorflow.python.keras.utils import tf_utils

from hanlp_common.constant import PAD
from hanlp.utils.io_util import get_resource, stdout_redirected
from hanlp.utils.log_util import logger
from hanlp.utils.tf_util import hanlp_register


@hanlp_register
class FastTextEmbeddingTF(tf.keras.layers.Embedding):
    """A non-trainable Keras embedding layer which looks up string tokens in a fastText model."""

    def __init__(self, filepath: str, padding=PAD, name=None, **kwargs):
        """
        Args:
            filepath: Path to the fastText model, resolved through ``get_resource``.
            padding: The padding token. Tokens equal to it are masked out; ``None`` disables masking.
            name: Name of the layer. Defaults to the file name of the model without its extension.
            **kwargs: Forwarded to the Keras ``Embedding`` layer, except for ``input_dim``, ``output_dim`` and
                ``mask_zero`` which are derived from the model and ``padding``.
        """
        import fasttext
        # ``padding=None`` means no masking (see ``mask_zero`` below), so there is nothing to encode then.
        self.padding = padding.encode('utf-8') if padding is not None else None
        self.filepath = filepath
        filepath = get_resource(filepath)
        assert os.path.isfile(filepath), f'Resolved path {filepath} is not a file'
        logger.debug('Loading fasttext model from [{}].'.format(filepath))
        # fasttext print a blank line here
        with stdout_redirected(to=os.devnull, stdout=sys.stderr):
            self.model = fasttext.load_model(filepath)
        kwargs.pop('input_dim', None)
        kwargs.pop('output_dim', None)
        kwargs.pop('mask_zero', None)
        if not name:
            name = os.path.splitext(os.path.basename(filepath))[0]
        super().__init__(input_dim=len(self.model.words), output_dim=self.model['king'].size,
                         mask_zero=padding is not None, trainable=False, dtype=tf.string, name=name, **kwargs)
        # Element-wise version of ``embed`` which returns an array of objects (one vector per token).
        embed_fn = np.frompyfunc(self.embed, 1, 1)
        self._embed_np = embed_fn

    def embed(self, word):
        """Look up one token in the fastText model."""
        return self.model[word]

    def embed_np(self, words: np.ndarray):
        """Embed an array of tokens.

        Args:
            words: An array of tokens. NOTE(review): presumably bytes, as they are compared with the encoded
                padding token - confirm against the caller.

        Returns:
            An array of the shape of ``words`` plus a trailing dimension of size ``output_dim``. When masking is
            enabled, vectors of padding tokens are zeroed and a ``(vectors, mask)`` tuple is returned instead.
        """
        output = self._embed_np(words)
        if self.mask_zero:
            mask = words != self.padding
            output *= mask
            output = np.stack(output.reshape(-1)).reshape(list(words.shape) + [self.output_dim])
            return output, tf.constant(mask)
        else:
            output = np.stack(output.reshape(-1)).reshape(list(words.shape) + [self.output_dim])
            return output

    @tf_utils.shape_type_conversion
    def build(self, input_shape):
        # There are no weights to create since vectors are looked up in the fastText model.
        self.built = True

    @tf_utils.shape_type_conversion
    def compute_output_shape(self, input_shape):
        return input_shape + (self.output_dim,)

    def call(self, inputs: tf.Tensor):
        """Embed a tensor of tokens.

        Inputs without a ``numpy`` method cannot be looked up, so zeros of the expected output shape are
        returned for them. Otherwise the tokens are embedded through ``embed_np``.
        """
        if isinstance(inputs, list):
            inputs = inputs[0]
        if not hasattr(inputs, 'numpy'):  # placeholder tensor
            inputs = tf.expand_dims(inputs, axis=-1)
            inputs = tf.tile(inputs, [1] * (len(inputs.shape) - 1) + [self.output_dim])
            inputs = tf.zeros_like(inputs, dtype=tf.float32)
            return inputs
        if self.mask_zero:
            outputs, masks = self.embed_np(inputs.numpy())
            outputs = tf.constant(outputs)
            outputs._keras_mask = masks
        else:
            outputs = self.embed_np(inputs.numpy())
            outputs = tf.constant(outputs)
        return outputs

    def compute_mask(self, inputs, mask=None):
        """Return a mask which is ``False`` at padding tokens, or ``None`` if masking is disabled."""
        if not self.mask_zero:
            return None
        return tf.not_equal(inputs, self.padding)

    def get_config(self):
        """Return the config of this layer without the entries of the base layer which are not needed."""
        config = {
            'filepath': self.filepath,
            'padding': self.padding.decode('utf-8') if self.padding is not None else None
        }
        base_config = super(FastTextEmbeddingTF, self).get_config()
        for junk in 'embeddings_initializer' \
                , 'batch_input_shape' \
                , 'embeddings_regularizer' \
                , 'embeddings_constraint' \
                , 'activity_regularizer' \
                , 'trainable' \
                , 'input_length' \
                :
            # Not every key is guaranteed to be present in the base config, so never fail on a missing one.
            base_config.pop(junk, None)
        return dict(list(base_config.items()) + list(config.items()))


================================================
FILE: hanlp/layers/embeddings/util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-09 15:45
from typing import Union

import torch
from torch import nn

from hanlp.common.vocab import Vocab
from hanlp.utils.init_util import embedding_uniform
from hanlp.utils.torch_util import load_word2vec, load_word2vec_as_vocab_tensor


def index_word2vec_with_vocab(filepath: str,
                              vocab: Vocab,
                              extend_vocab=True,
                              unk=None,
                              lowercase=False,
                              init='uniform',
                              normalize=None) -> torch.Tensor:
    """Load pretrained embeddings and arrange them in the order of a vocabulary.

    The vocabulary is locked by this function, after being optionally extended with the tokens of the
    pretrained embedding file.

    Args:
        filepath: The path to pretrained embedding.
        vocab: The vocabulary from training set.
        extend_vocab: Unlock vocabulary of training set to add those tokens in pretrained embedding file.
        unk: UNK token.
        lowercase: Convert words in pretrained embeddings into lowercase.
        init: Indicate which initialization to use for oov tokens: ``'uniform'``, or ``'zeros'`` (or any falsy
            value) to leave them as zeros.
        normalize: The method to normalize the embedding matrix, one of ``'norm'``, ``'l2'`` and ``'std'``.
            A falsy value (``None``, ``False``) disables normalization.

    Returns:
        An embedding matrix.

    Raises:
        ValueError: If ``init`` or ``normalize`` is set to an unsupported value.

    """
    pret_vocab, pret_matrix = load_word2vec_as_vocab_tensor(filepath)
    if unk and unk in pret_vocab:
        # Rename the UNK of the pretrained embeddings to the one used by the vocabulary.
        pret_vocab[vocab.safe_unk_token] = pret_vocab.pop(unk)
    if extend_vocab:
        vocab.unlock()
        for word in pret_vocab:
            vocab.get_idx(word.lower() if lowercase else word)
    vocab.lock()
    ids = []

    # Tokens missing from the pretrained embeddings get rows appended after the pretrained ones.
    unk_id_offset = 0
    for word, _ in vocab.token_to_idx.items():
        word_id = pret_vocab.get(word, None)
        # Retry lower case
        if word_id is None:
            word_id = pret_vocab.get(word.lower(), None)
        if word_id is None:
            word_id = len(pret_vocab) + unk_id_offset
            unk_id_offset += 1
        ids.append(word_id)
    if unk_id_offset:
        unk_embeds = torch.zeros(unk_id_offset, pret_matrix.size(1))
        if init and init != 'zeros':
            if init == 'uniform':
                init = embedding_uniform
            else:
                raise ValueError(f'Unsupported init {init}')
            unk_embeds = init(unk_embeds)
        pret_matrix = torch.cat([pret_matrix, unk_embeds])
    ids = torch.LongTensor(ids)
    embedding = pret_matrix.index_select(0, ids)
    if normalize == 'norm':
        embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)
    elif normalize == 'l2':
        embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)
    elif normalize == 'std':
        embedding /= torch.std(embedding)
    elif normalize:
        # Only a truthy unknown method is an error; the default ``None`` (or ``False``) means no normalization.
        raise ValueError(f'Unsupported normalization method {normalize}')
    return embedding


def build_word2vec_with_vocab(embed: Union[str, int],
                              vocab: Vocab,
                              extend_vocab=True,
                              unk=None,
                              lowercase=False,
                              trainable=False,
                              init='zeros',
                              normalize=None) -> nn.Embedding:
    """Build word2vec embedding and a vocab.

    Args:
        embed: Either a path to a pretrained embedding file, or an integer giving the size of a freshly
            created embedding.
        vocab: The vocabulary from training set.
        extend_vocab: Unlock vocabulary of training set to add those tokens in pretrained embedding file.
        unk: UNK token.
        lowercase: Convert words in pretrained embeddings into lowercase.
        trainable: ``False`` to use static embeddings.
        init: Indicate which initialization to use for oov tokens.
        normalize: The method to normalize the embedding matrix, forwarded to ``index_word2vec_with_vocab``.

    Returns:
        An ``nn.Embedding`` layer whose padding index is ``vocab.pad_idx``.

    Raises:
        ValueError: If ``embed`` is neither a string nor an integer.

    """
    if isinstance(embed, str):
        # Pretrained: load the matrix in the order of the vocabulary and wrap it.
        matrix = index_word2vec_with_vocab(embed, vocab, extend_vocab, unk, lowercase, init, normalize)
        return nn.Embedding.from_pretrained(matrix, freeze=not trainable, padding_idx=vocab.pad_idx)
    if isinstance(embed, int):
        # From scratch: ``embed`` is the embedding size.
        return nn.Embedding(len(vocab), embed, padding_idx=vocab.pad_idx)
    raise ValueError(f'Unsupported parameter type: {embed}')


================================================
FILE: hanlp/layers/embeddings/util_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-09 15:46
from typing import Union

import tensorflow as tf

from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp.layers.embeddings.char_cnn_tf import CharCNNEmbeddingTF
from hanlp.layers.embeddings.char_rnn_tf import CharRNNEmbeddingTF
from hanlp.layers.embeddings.concat_embedding import ConcatEmbedding
from hanlp.layers.embeddings.contextual_string_embedding_tf import ContextualStringEmbeddingTF
from hanlp.layers.embeddings.fast_text_tf import FastTextEmbeddingTF
from hanlp.layers.embeddings.word2vec_tf import Word2VecEmbeddingTF, StringWord2VecEmbeddingTF, Word2VecEmbeddingV1

# Make every registered Keras custom object named ``HanLP>...TF`` available under the same name without the
# ``TF`` suffix too, so that class names lacking the suffix can still be resolved.
_upgrade = tf.keras.utils.get_custom_objects()
for k, v in list(_upgrade.items()):  # iterate over a copy since the registry is modified inside the loop
    if k.startswith('HanLP>') and k.endswith('TF'):
        _upgrade[k[:-2]] = v


def build_embedding(embeddings: Union[str, int, dict], word_vocab: VocabTF, transform: Transform):
    """Build an embedding from a flexible specification.

    Besides building the embedding, this function adjusts ``transform`` and ``transform.config`` according to
    the needs of the embedding, and it writes vocabularies into the given config dicts.

    Args:
        embeddings: What to build:

            - a falsy value: nothing is built,
            - an ``int``: a trainable Keras ``Embedding`` layer of this size over ``word_vocab``,
            - a ``dict``: a serialized Keras layer to deserialize,
            - a ``list``: several specifications which are built recursively and bundled in a
              ``ConcatEmbedding``,
            - a ``str``: a path passed to ``Word2VecEmbeddingV1``.
        word_vocab: The vocabulary of words.
        transform: The transform whose ``config`` and ``map_x`` get adjusted.

    Returns:
        ``None``, a Keras layer, a ``ConcatEmbedding``, or - for a ``str`` - the ``array_ks`` attribute of the
        ``Word2VecEmbeddingV1``.
    """
    if not embeddings:
        return None
    config = transform.config
    if isinstance(embeddings, int):
        embeddings = tf.keras.layers.Embedding(input_dim=len(word_vocab), output_dim=embeddings,
                                               trainable=True, mask_zero=True)
        config.embedding_trainable = True
    elif isinstance(embeddings, dict):
        # Upgrade to 2.1
        # Class names of HanLP lacking the ``TF`` suffix are mapped to the suffixed ones for the checks below.
        embed_name = embeddings['class_name'].split('>')[-1]
        if embeddings['class_name'].startswith('HanLP>') and not embeddings['class_name'].endswith('TF'):
            embed_name += 'TF'
        # Embeddings need vocab
        if embed_name in (Word2VecEmbeddingTF.__name__, StringWord2VecEmbeddingTF.__name__):
            # Vocab won't present in the dict
            embeddings['config']['vocab'] = word_vocab
        elif embed_name in (CharRNNEmbeddingTF.__name__, CharCNNEmbeddingTF.__name__):
            embeddings['config']['word_vocab'] = word_vocab
            embeddings['config']['char_vocab'] = transform.char_vocab
            transform.map_x = False
        layer: tf.keras.layers.Embedding = tf.keras.utils.deserialize_keras_object(embeddings)
        # Embedding specific configuration
        if layer.__class__.__name__ in ('FastTextEmbedding', 'FastTextEmbeddingTF'):
            config.run_eagerly = True  # fasttext can only run in eager mode
            config.embedding_trainable = False
            transform.map_x = False  # fasttext accept string instead of int
        return layer
    elif isinstance(embeddings, list):
        if embeddings_require_string_input(embeddings):
            # those embeddings require string as input
            transform.map_x = False
            # use the string version of Word2VecEmbedding instead
            for embed in embeddings:
                if embed['class_name'].split('>')[-1] == Word2VecEmbeddingTF.__name__:
                    embed['class_name'] = 'HanLP>' + StringWord2VecEmbeddingTF.__name__
        # Each item is built through a recursive call, then all of them are bundled together.
        return ConcatEmbedding(*[build_embedding(embed, word_vocab, transform) for embed in embeddings])
    else:
        assert isinstance(embeddings, str), 'embedding should be str or int or dict'
        # word_vocab.unlock()
        embeddings = Word2VecEmbeddingV1(path=embeddings, vocab=word_vocab,
                                         trainable=config.get('embedding_trainable', False))
        embeddings = embeddings.array_ks
    # Reached by the ``int`` and the ``str`` branches only, the other branches return early.
    return embeddings


def any_embedding_in(embeddings, *cls):
    """Check whether any serialized embedding is of one of the given classes.

    Class names are compared after dropping everything up to the last ``>``. For a class whose name ends with
    ``TF``, the name without that suffix is accepted as well.

    Args:
        embeddings: An iterable of embedding specifications. Only dicts (with a ``class_name``) are examined.
        *cls: The classes to look for.

    Returns:
        ``True`` if there is at least one match, ``False`` otherwise.
    """
    accepted = set()
    for klass in cls:
        class_name = klass.__name__
        accepted.add(class_name)
        if class_name.endswith('TF'):
            accepted.add(class_name[:-2])
    return any(isinstance(spec, dict) and spec['class_name'].split('>')[-1] in accepted for spec in embeddings)


def embeddings_require_string_input(embeddings):
    """Check whether any of the embeddings is char RNN, char CNN, fastText or contextual string embedding.

    Args:
        embeddings: One embedding specification or a list of them.

    Returns:
        ``True`` if at least one specification is of one of those classes.
    """
    specs = embeddings if isinstance(embeddings, list) else [embeddings]
    return any_embedding_in(specs, CharRNNEmbeddingTF, CharCNNEmbeddingTF, FastTextEmbeddingTF,
                            ContextualStringEmbeddingTF)


def embeddings_require_char_input(embeddings):
    """Check whether any of the embeddings is char RNN, char CNN or contextual string embedding.

    Args:
        embeddings: One embedding specification or a list of them.

    Returns:
        ``True`` if at least one specification is of one of those classes.
    """
    specs = embeddings if isinstance(embeddings, list) else [embeddings]
    return any_embedding_in(specs, CharRNNEmbeddingTF, CharCNNEmbeddingTF, ContextualStringEmbeddingTF)


================================================
FILE: hanlp/layers/embeddings/word2vec.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-09 13:38
import logging
import math
import os.path
from typing import Optional, Callable, Union, List, Dict

import torch
from hanlp_common.configurable import AutoConfigurable
from hanlp_common.constant import HANLP_VERBOSE
from hanlp_trie.trie import Trie
from torch import nn
from torch.utils.data import DataLoader

from hanlp.common.dataset import TransformableDataset, PadSequenceDataLoader
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import VocabDict
from hanlp.common.vocab import Vocab
from hanlp.layers.dropout import WordDropout
from hanlp.layers.embeddings.embedding import Embedding, EmbeddingDim
from hanlp.layers.embeddings.util import build_word2vec_with_vocab
from hanlp.utils.log_util import flash
from hanlp.utils.torch_util import load_word2vec_as_vocab_tensor


class Word2VecEmbeddingModule(nn.Module, EmbeddingDim):
    def __init__(self, field: str, embed: nn.Embedding, word_dropout: WordDropout = None, cpu=False,
                 second_channel=False, num_tokens_in_trn=None, unk_idx=1) -> None:
        """A word2vec style embedding module which maps a token to its embedding through looking up a pre-defined table.

        Args:
            field: The field to work on. Usually some token fields.
            embed: An ``Embedding`` layer.
            word_dropout: The probability of randomly replacing a token with ``UNK``.
            cpu: Reside on CPU instead of GPU.
            second_channel: A trainable second channel for each token, which will be added to pretrained embeddings.
            num_tokens_in_trn: The number of tokens in training set.
            unk_idx: The index of ``UNK``.
        """
        super().__init__()
        # NOTE: this attribute shadows the ``cpu()`` method of ``nn.Module`` on instances of this class.
        self.cpu = cpu
        self.field = field
        self.embed = embed
        self.word_dropout = word_dropout
        self.num_tokens_in_trn = num_tokens_in_trn
        self.unk_idx = unk_idx
        if second_channel:
            n_words, n_embed = embed.weight.size()
            if num_tokens_in_trn:
                # The second channel only covers the tokens of the training set.
                n_words = num_tokens_in_trn
            second_channel = nn.Embedding(num_embeddings=n_words,
                                          embedding_dim=n_embed)
            nn.init.zeros_(second_channel.weight)
        self.second_channel = second_channel

    def forward(self, batch: dict, **kwargs):
        """Look up the embeddings of the ids stored in ``batch['{field}_id']``.

        Args:
            batch: A batch holding the ids of tokens.
            **kwargs: Not used.

        Returns:
            The embeddings, placed on the device of the ids.
        """
        x: torch.Tensor = batch[f'{self.field}_id']
        if self.cpu:
            # The table resides on CPU: look up there and move the result back afterwards.
            device = x.device
            x = x.cpu()
        if self.word_dropout:
            x = self.word_dropout(x)
        if self.second_channel:
            # Ids beyond the size of the second channel are mapped to UNK for the second channel only.
            ext_mask = x.ge(self.second_channel.num_embeddings)
            ext_words = x.masked_fill(ext_mask, self.unk_idx)
            x = self.embed(x) + self.second_channel(ext_words)
        else:
            x = self.embed(x)
        if self.cpu:
            # noinspection PyUnboundLocalVariable
            x = x.to(device)
        return x

    @property
    def embedding_dim(self) -> int:
        return self.embed.embedding_dim

    def _apply(self, fn, *args, **kwargs):
        """Apply ``fn`` to this module unless it is pinned to CPU.

        ``nn.Module`` implements ``to``, ``cuda`` etc. through ``_apply`` and returns its result, hence the
        module itself has to be returned even when nothing gets applied.
        """
        if self.cpu:  # This might block all fn not limiting to moving between devices.
            return self
        return super(Word2VecEmbeddingModule, self)._apply(fn, *args, **kwargs)


class Word2VecEmbedding(Embedding, AutoConfigurable):
    def __init__(self,
                 field,
                 embed: Union[int, str],
                 extend_vocab=True,
                 pad=None,
                 unk=None,
                 lowercase=False,
                 trainable=False,
                 second_channel=False,
                 word_dropout: float = 0,
                 normalize=False,
                 cpu=False,
                 init='zeros') -> None:
        """A word2vec style embedding builder which maps a token to its embedding through looking up a pre-defined
        table.

        Args:
            field: The field to work on. Usually some token fields.
            embed: A path to pre-trained embedding file or an integer defining the size of randomly initialized
                embedding.
            extend_vocab: Unlock vocabulary of training set to add those tokens in pre-trained embedding file.
            pad: The padding token.
            unk: The unknown token.
            lowercase: Convert words in pretrained embeddings into lowercase.
            trainable: ``False`` to use static embeddings.
            second_channel: A trainable second channel for each token, which will be added to pretrained embeddings.
            word_dropout: The probability of randomly replacing a token with ``UNK``.
            normalize: ``l2`` or ``std`` to normalize the embedding matrix.
            cpu: Reside on CPU instead of GPU.
            init: Indicate which initialization to use for oov tokens.
        """
        super().__init__()
        self.pad = pad
        self.second_channel = second_channel
        self.cpu = cpu
        self.normalize = normalize
        self.word_dropout = word_dropout
        self.init = init
        self.lowercase = lowercase
        self.unk = unk
        self.extend_vocab = extend_vocab
        self.trainable = trainable
        self.embed = embed
        self.field = field

    def module(self, vocabs: VocabDict, **kwargs) -> Optional[nn.Module]:
        """Build the embedding module.

        Args:
            vocabs: The vocabularies, which must contain the one of ``self.field``.
            **kwargs: Not used.

        Returns:
            A ``Word2VecEmbeddingModule``.
        """
        vocab = vocabs[self.field]
        # Has to be taken before the embedding is built, as building may extend the vocabulary.
        num_tokens_in_trn = len(vocab)
        embed = build_word2vec_with_vocab(self.embed,
                                          vocab,
                                          self.extend_vocab,
                                          self.unk,
                                          self.lowercase,
                                          self.trainable,
                                          init=self.init,  # forward the configured init instead of dropping it
                                          normalize=self.normalize)
        if self.word_dropout:
            assert vocab.unk_token, f'unk_token of vocab {self.field} has to be set in order to ' \
                                    f'make use of word_dropout'
            # The padding token is excluded from word dropout.
            padding = []
            if vocab.pad_token:
                padding.append(vocab.pad_idx)
            word_dropout = WordDropout(self.word_dropout, vocab.unk_idx, exclude_tokens=padding)
        else:
            word_dropout = None
        return Word2VecEmbeddingModule(self.field, embed, word_dropout=word_dropout, cpu=self.cpu,
                                       second_channel=self.second_channel, num_tokens_in_trn=num_tokens_in_trn,
                                       unk_idx=vocab.unk_idx)

    def transform(self, vocabs: VocabDict = None, **kwargs) -> Optional[Callable]:
        """Make sure that a vocabulary exists for ``self.field``, then defer to the parent class.

        Args:
            vocabs: The vocabularies. A new ``Vocab`` is added to them if ``self.field`` is missing.
            **kwargs: Forwarded to the parent class.

        Returns:
            Whatever the ``transform`` method of the parent class returns.
        """
        assert vocabs is not None
        if self.field not in vocabs:
            vocabs[self.field] = Vocab(pad_token=self.pad, unk_token=self.unk)
        return super().transform(**kwargs)


class Word2VecDataset(TransformableDataset):
    """A dataset for the word2vec toy component which does not support loading from files."""

    def load_file(self, filepath: str):
        """Loading from a file is not supported.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('Not supported.')


class Word2VecEmbeddingComponent(TorchComponent):

    def __init__(self, **kwargs) -> None:
        """Toy example of Word2VecEmbedding. It simply returns the embedding of a given word, and finds the most
        similar words of it. Everything related to training raises ``NotImplementedError``.

        Args:
            **kwargs: Forwarded to ``TorchComponent``.
        """
        super().__init__(**kwargs)
        # Built lazily by the ``tokenizer`` property, and reset whenever the model is built.
        self._tokenizer: Trie = None

    def build_dataloader(self, data: List[str], shuffle=False, device=None, logger: logging.Logger = None,
                         doc2vec=False, batch_size=32, **kwargs) -> DataLoader:
        """Build a dataloader over words or phrases.

        Args:
            data: Words or phrases, each of which becomes the ``token`` field of one sample.
            shuffle: Not used.
            device: The device passed to the dataloader.
            logger: Not used.
            doc2vec: ``True`` to split each sample with the Trie-based tokenizer, otherwise the vocabularies are
                used as the transform.
            batch_size: Number of samples per batch.
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader``.
        """
        dataset = Word2VecDataset([{'token': x} for x in data], transform=self._tokenize if doc2vec else self.vocabs)
        return PadSequenceDataLoader(dataset, device=device, batch_size=batch_size)

    def build_optimizer(self, **kwargs):
        raise NotImplementedError('Not supported.')

    def build_criterion(self, **kwargs):
        raise NotImplementedError('Not supported.')

    def build_metric(self, **kwargs):
        raise NotImplementedError('Not supported.')

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, **kwargs):
        raise NotImplementedError('Not supported.')

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        raise NotImplementedError('Not supported.')

    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        raise NotImplementedError('Not supported.')

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        # Nothing is read from ``save_dir``: an empty vocabulary is created instead.
        self.vocabs['token'] = Vocab()

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        # There are no weights to load.
        pass

    def build_model(self, training=True, **kwargs) -> torch.nn.Module:
        """Build the module of the configured embedding builder and drop the cached tokenizer."""
        self._tokenizer = None
        embed: Word2VecEmbedding = self.config.embed
        model = embed.module(self.vocabs)
        return model

    def predict(self, word: str, doc2vec=False, **kwargs):
        """Embed a word or a phrase.

        Args:
            word: A word or a phrase.
            doc2vec: ``True`` to split ``word`` with the Trie-based tokenizer first.
            **kwargs: Not used.

        Returns:
            The first item of the model output, further reduced when ``doc2vec`` is enabled.
        """
        dataloader = self.build_dataloader([word], device=self.device, doc2vec=doc2vec)
        for batch in dataloader:  # It's a toy so doesn't really do batching
            embeddings = self.model(batch)[0]
            if doc2vec:
                # NOTE(review): this indexes the first item once more before averaging over dim 0. If
                # ``embeddings`` is (tokens, dim) here, the result is a scalar instead of a vector - TODO confirm.
                embeddings = embeddings[0].mean(dim=0)
            return embeddings

    @torch.no_grad()
    def most_similar(self, words: Union[str, List[str]], topk=10, doc2vec=False, similarity_less_than=None,
                     batch_size=32) -> Union[Dict[str, float], List[Dict[str, float]]]:
        """Find the `topk` most similar words of a given word or phrase.

        Args:
            words: A word or phrase or multiple words/phrases.
            topk: Number of top similar words.
            doc2vec: Enable doc2vec model for processing OOV and phrases.
            similarity_less_than: Only return words with a similarity less than this value.
            batch_size: Number of words or phrases per batch.

        Returns:
            Similar words and similarities stored in a dict.
        """
        flat = isinstance(words, str)
        if flat:
            words = [words]
        dataloader = self.build_dataloader(words, device=self.device, doc2vec=doc2vec, batch_size=batch_size)
        results = []
        vocab = self.vocabs['token']
        for batch in dataloader:
            embeddings = self.model(batch)
            token_id = batch['token_id']
            if doc2vec:
                # Average the embeddings of each row over its non-zero ids.
                # NOTE(review): presumably 0 is the index of padding - confirm against ``Vocab``.
                lens = token_id.count_nonzero(dim=1)
                embeddings = embeddings.sum(1)
                embeddings = embeddings / lens.unsqueeze(1)
                block_word_id = batch['block_word_id']
                # A row consisting of nothing but one UNK has no meaningful embedding.
                token_is_unk = (lens == 1) & (token_id[:, 0] == vocab.unk_idx)
            else:
                block_word_id = token_id
                token_is_unk = token_id == vocab.unk_idx
            # Compare every query against every row of the embedding table.
            similarities = torch.nn.functional.cosine_similarity(embeddings.unsqueeze(1), self.model.embed.weight,
                                                                 dim=-1)
            if similarity_less_than is not None:
                similarities[similarities > similarity_less_than] = -math.inf
            # Block one id per query (``block_word_id``) so that it never shows up in its own results.
            similarities[torch.arange(similarities.size(0), device=self.device), block_word_id] = -math.inf
            scores, indices = similarities.topk(topk)

            # Queries which are UNK get an empty result.
            for sc, idx, unk in zip(scores.tolist(), indices.tolist(), token_is_unk.tolist()):
                results.append(dict() if unk else dict(zip([vocab.idx_to_token[i] for i in idx], sc)))
        if flat:
            results = results[0]
        return results

    def _tokenize(self, sample: dict) -> dict:
        """Split the ``token`` field of a sample into ids of vocabulary entries through longest matching.

        Args:
            sample: A sample with a ``token`` field.

        Returns:
            The same sample with ``token_id`` and ``block_word_id`` added.
        """
        tokens = sample['token']
        ids = [idx for b, e, idx in self.tokenizer.parse_longest(tokens)]
        vocab = self.vocabs['token']
        if not ids:
            # Nothing matched: fall back to a single UNK.
            ids = [vocab.unk_idx]
        sample['token_id'] = ids
        # Only a sample which is exactly one id gets that id blocked; otherwise the padding index is used.
        sample['block_word_id'] = ids[0] if len(ids) == 1 else vocab.pad_idx
        return sample

    @property
    def tokenizer(self):
        """A Trie over the vocabulary, built on first access."""
        if not self._tokenizer:
            if HANLP_VERBOSE:
                flash('Building Trie-based tokenizer for Doc2Vec [blink][yellow]...[/yellow][/blink]')
            self._tokenizer = Trie(self.vocabs['token'].token_to_idx)
            if HANLP_VERBOSE:
                flash('')
        return self._tokenizer

    def load_config(self, save_dir, filename='config.json', **kwargs):
        """Load the config, or create one on the fly when ``save_dir`` is a file.

        When ``save_dir`` is an existing file, it is used as the ``embed`` of an l2 normalized
        ``Word2VecEmbedding`` on the ``token`` field, and nothing is read from a config file.
        """
        if os.path.isfile(save_dir):
            self.config.update({'classpath': 'hanlp.layers.embeddings.word2vec.Word2VecEmbeddingComponent',
                                'embed': Word2VecEmbedding(field='token', embed=save_dir, normalize='l2')})
            return
        super().load_config(save_dir, filename, **kwargs)


class GazetterTransform(object):
    """Annotate a sample with the lexicon words found in one of its fields.

    Every occurrence of a lexicon word is recorded twice: at the position where
    it ends (``skips_l2r``) and at the position where it starts (``skips_r2l``).
    """

    def __init__(self, field, words: dict) -> None:
        """
        Args:
            field: Key of the sample entry to scan.
            words: Mapping from lexicon word to its id.
        """
        super().__init__()
        self.field = field
        self.trie = Trie()
        for word, idx in words.items():
            self.trie[word] = idx

    def __call__(self, sample: dict) -> dict:
        """Scan ``sample[self.field]`` and add the skip annotations in place.

        For ``direction`` in ``skips_l2r`` and ``skips_r2l`` the following keys
        are written, each holding one entry per position of the field:

        - ``{field}_{direction}_offset``: for ``skips_l2r`` the start offsets of
          the words ending at that position; for ``skips_r2l`` the (inclusive)
          end offsets of the words starting at that position.
        - ``{field}_{direction}_id``: the lexicon ids of those words.
        - ``{field}_{direction}_count``: how many words were recorded there.

        Returns:
            The same ``sample`` dict.
        """
        tokens = sample[self.field]
        skips_l2r = [[] for _ in range(len(tokens))]
        skips_r2l = [[] for _ in range(len(tokens))]
        for match in self.trie.parse(tokens):
            # NOTE(review): this code was written for matches shaped
            # ``(word, id, begin, end)``, while ``parse_longest`` elsewhere in this
            # file is unpacked as ``(begin, end, id)``. Both shapes are accepted
            # here; confirm which one the installed Trie actually yields.
            if len(match) == 4:
                _, idx, begin, end = match
            else:
                begin, end, idx = match
            last = end - 1  # ``end`` is exclusive, positions are inclusive
            skips_l2r[last].append((begin, idx))
            skips_r2l[begin].append((last, idx))
        for direction, skips in (('skips_l2r', skips_l2r), ('skips_r2l', skips_r2l)):
            sample[f'{self.field}_{direction}_offset'] = [[offset for offset, _ in cell] for cell in skips]
            sample[f'{self.field}_{direction}_id'] = [[idx for _, idx in cell] for cell in skips]
            sample[f'{self.field}_{direction}_count'] = [len(cell) for cell in skips]
        return sample


class GazetteerEmbedding(Embedding, AutoConfigurable):
    """Embedding of multi-character lexicon words taken from a pretrained file.

    Only entries whose word is longer than one character are kept; they are
    re-indexed densely in the order they appear in the loaded vocabulary.
    """

    def __init__(self, embed: str, field='char', trainable=False) -> None:
        """
        Args:
            embed: Identifier passed to ``load_word2vec_as_vocab_tensor``.
            field: The sample field the transform scans for lexicon words.
            trainable: ``True`` to let the embedding matrix be updated.
        """
        self.trainable = trainable
        self.embed = embed
        self.field = field
        vocab, matrix = load_word2vec_as_vocab_tensor(self.embed)
        # Keep (word, original row) pairs for words longer than one character.
        kept = [(word, row) for word, row in vocab.items() if len(word) > 1]
        rows = torch.tensor([row for _, row in kept])
        self._vocab = {word: new_id for new_id, (word, _) in enumerate(kept)}
        self._matrix = matrix.index_select(0, rows)

    def transform(self, **kwargs) -> Optional[Callable]:
        """Build the transform that looks up the kept words in ``self.field``."""
        return GazetterTransform(self.field, self._vocab)

    def module(self, **kwargs) -> Optional[nn.Module]:
        """Build an ``nn.Embedding`` from the filtered matrix, frozen unless trainable."""
        frozen = not self.trainable
        return nn.Embedding.from_pretrained(self._matrix, freeze=frozen)

    @staticmethod
    def _remove_short_tokens(word2vec):
        """Return a copy of ``word2vec`` without keys of length one or less."""
        return {w: v for w, v in word2vec.items() if len(w) > 1}


================================================
FILE: hanlp/layers/embeddings/word2vec_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-24 21:49
import os
from typing import Tuple, Union, List

import numpy as np
import tensorflow as tf
from tensorflow.python.ops import math_ops

from hanlp.common.vocab_tf import VocabTF
from hanlp.utils.io_util import get_resource
from hanlp.utils.torch_util import load_word2vec
from hanlp.utils.tf_util import hanlp_register
from hanlp_common.util import DummyContext


class Word2VecEmbeddingV1(tf.keras.layers.Layer):
    """Keras layer holding a pretrained word2vec matrix together with its vocabulary.

    The pretrained matrix is kept as a numpy array (``array_np``) and is also
    used as the constant initializer of an inner Keras ``Embedding``
    (``array_ks``) which performs the actual lookup in :meth:`call`.
    """

    def __init__(self, path: str = None, vocab: VocabTF = None, normalize: bool = False, load_all=True, mask_zero=True,
                 trainable=False, name=None, dtype=None, dynamic=False, **kwargs):
        """
        Args:
            path: Path of the pretrained embedding file.
            vocab: Vocabulary to extend with the pretrained words; a new one is
                created when omitted.
            normalize: Forwarded to :meth:`_load`, which currently ignores it.
            load_all: ``True`` to unlock a locked ``vocab`` so that pretrained
                words can be added to it.
            mask_zero: ``True`` to mask padding positions.
            trainable: ``True`` to fine-tune the embedding matrix.
            name: Layer name.
            dtype: Layer dtype.
            dynamic: Forwarded to the Keras ``Layer`` constructor.
            **kwargs: Forwarded to the Keras ``Layer`` constructor.
        """
        super().__init__(trainable, name, dtype, dynamic, **kwargs)
        if load_all and vocab and vocab.locked:
            vocab.unlock()
        self.vocab, self.array_np = self._load(path, vocab, normalize)
        self.vocab.lock()
        self.array_ks = tf.keras.layers.Embedding(input_dim=len(self.vocab), output_dim=self.dim, trainable=trainable,
                                                  embeddings_initializer=tf.keras.initializers.Constant(self.array_np),
                                                  mask_zero=mask_zero)
        self.mask_zero = mask_zero
        self.supports_masking = mask_zero

    def compute_mask(self, inputs, mask=None):
        """Return a mask that is ``False`` where ``inputs`` equals the pad index."""
        if not self.mask_zero:
            return None

        return math_ops.not_equal(inputs, self.vocab.pad_idx)

    def call(self, inputs, **kwargs):
        """Look up ``inputs`` in the inner Keras embedding."""
        return self.array_ks(inputs, **kwargs)

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.dim

    @staticmethod
    def _load(path, vocab, normalize=False) -> Tuple[VocabTF, Union[np.ndarray, None]]:
        """Load pretrained vectors and align them with ``vocab``.

        Every pretrained word is added to ``vocab``. Vocabulary entries without
        a pretrained vector (also after lower-casing) keep an all-zero row.

        Args:
            path: Path of the pretrained file; when falsy no matrix is loaded.
            vocab: Vocabulary to extend, or a falsy value to create a new one.
            normalize: Accepted for interface compatibility; not used.

        Returns:
            The vocabulary and the ``(len(vocab), dim)`` float32 matrix, or
            ``None`` in place of the matrix when ``path`` is falsy.
        """
        if not vocab:
            vocab = VocabTF()
        if not path:
            return vocab, None
        assert vocab.unk_idx is not None

        word2vec, dim = load_word2vec(path)
        for word in word2vec:
            vocab.get_idx(word)

        pret_embs = np.zeros(shape=(len(vocab), dim), dtype=np.float32)
        for word, idx in vocab.token_to_idx.items():
            vec = word2vec.get(word, None)
            if vec is None:
                # Retry with the lower-cased form.
                vec = word2vec.get(word.lower(), None)
            if vec is not None:
                pret_embs[idx] = vec
        return vocab, pret_embs

    @property
    def size(self):
        """Number of rows of the pretrained matrix, or ``None`` if none was loaded."""
        if self.array_np is not None:
            return self.array_np.shape[0]

    @property
    def dim(self):
        """Embedding dimension, or ``None`` if no matrix was loaded."""
        if self.array_np is not None:
            return self.array_np.shape[1]

    @property
    def shape(self):
        """Shape of the pretrained matrix, or ``None`` if no matrix was loaded."""
        if self.array_np is None:
            return None
        return self.array_np.shape

    def get_vector(self, word: str) -> np.ndarray:
        """Return the pretrained vector of ``word`` without growing the vocabulary."""
        assert self.array_np is not None
        return self.array_np[self.vocab.get_idx_without_add(word)]

    def __getitem__(self, word: Union[str, List, tf.Tensor]) -> np.ndarray:
        """Look up pretrained vectors.

        Args:
            word: A single word, a list of words, or a tensor of words
                (``tf.string``) or ids (``tf.int32``/``tf.int64``).

        Returns:
            A numpy array for ``str``/``list`` queries and a ``tf.Tensor`` for
            tensor queries. Any other input falls through and yields ``None``.
        """
        if isinstance(word, str):
            return self.get_vector(word)
        elif isinstance(word, list):
            vectors = np.zeros(shape=(len(word), self.dim))
            for idx, token in enumerate(word):
                vectors[idx] = self.get_vector(token)
            return vectors
        elif isinstance(word, tf.Tensor):
            # The original code read ``self.array_tf`` which is never assigned; use
            # the pretrained numpy matrix, consistent with the str/list branches.
            assert self.array_np is not None
            table = tf.convert_to_tensor(self.array_np)
            if word.dtype == tf.string:
                word_ids = self.vocab.token_to_idx_table.lookup(word)
                return tf.nn.embedding_lookup(table, word_ids)
            elif word.dtype == tf.int32 or word.dtype == tf.int64:
                return tf.nn.embedding_lookup(table, word)


@hanlp_register
class Word2VecEmbeddingTF(tf.keras.layers.Embedding):
    """Keras ``Embedding`` initialised from a pretrained word2vec file.

    Rows of vocabulary entries found in the pretrained file are copied from it;
    the remaining rows come from ``embeddings_initializer``.
    """

    def __init__(self, filepath: str = None, vocab: VocabTF = None, expand_vocab=True, lowercase=True,
                 input_dim=None, output_dim=None, unk=None, normalize=False,
                 embeddings_initializer='VarianceScaling',
                 embeddings_regularizer=None,
                 activity_regularizer=None, embeddings_constraint=None, mask_zero=True, input_length=None,
                 name=None, cpu=True, **kwargs):
        """
        Args:
            filepath: Identifier of the pretrained file, resolved by ``get_resource``.
            vocab: Vocabulary indexing the embedding rows. When ``None`` a new
                one is created from the pretrained words.
            expand_vocab: ``True`` to add pretrained words to a mutable ``vocab``.
            lowercase: ``True`` to lower-case words when expanding the vocabulary
                and to retry failed lookups in lower case.
            input_dim: Expected vocabulary size; asserted if given.
            output_dim: Expected embedding dimension; asserted if given.
            unk: Name of the unknown token inside the pretrained file. If present
                it is re-keyed to ``vocab.safe_unk_token``.
            normalize: ``True`` to divide the matrix by its standard deviation.
            embeddings_initializer: Initializer for rows without pretrained vector.
            embeddings_regularizer: Forwarded to Keras ``Embedding``.
            activity_regularizer: Forwarded to Keras ``Embedding``.
            embeddings_constraint: Forwarded to Keras ``Embedding``.
            mask_zero: Forwarded to Keras ``Embedding``.
            input_length: Forwarded to Keras ``Embedding``.
            name: Layer name; defaults to the file's base name without extension.
            cpu: ``True`` to run the initializer on ``cpu:0``.
            **kwargs: Forwarded to Keras ``Embedding``.
        """
        filepath = get_resource(filepath)
        word2vec, _output_dim = load_word2vec(filepath)
        if output_dim:
            assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
        output_dim = _output_dim
        # Create the fallback vocabulary first: the ``unk`` replacement below needs
        # ``vocab.safe_unk_token`` and used to crash when no vocab was passed in.
        vocab_created = vocab is None
        if vocab_created:
            vocab = VocabTF()
        # if the `unk` token exists in the pretrained,
        # then replace it with a self-defined one, usually the one in word vocab
        if unk and unk in word2vec:
            word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
        if vocab_created:
            vocab.update(word2vec.keys())
        if expand_vocab and vocab.mutable:
            for word in word2vec:
                vocab.get_idx(word.lower() if lowercase else word)
        if input_dim:
            assert input_dim == len(vocab), f'input_dim = {input_dim} does not match {filepath}'
        input_dim = len(vocab)
        # init matrix
        self._embeddings_initializer = embeddings_initializer
        embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
        with tf.device('cpu:0') if cpu else DummyContext():
            pret_embs = embeddings_initializer(shape=[input_dim, output_dim]).numpy()
        # insert to pret_embs
        for word, idx in vocab.token_to_idx.items():
            vec = word2vec.get(word, None)
            # Retry lower case
            if vec is None and lowercase:
                vec = word2vec.get(word.lower(), None)
            if vec is not None:
                pret_embs[idx] = vec
        if normalize:
            pret_embs /= np.std(pret_embs)
        if not name:
            name = os.path.splitext(os.path.basename(filepath))[0]
        super().__init__(input_dim, output_dim, tf.keras.initializers.Constant(pret_embs), embeddings_regularizer,
                         activity_regularizer, embeddings_constraint, mask_zero, input_length, name=name, **kwargs)
        self.filepath = filepath
        self.expand_vocab = expand_vocab
        self.lowercase = lowercase

    def get_config(self):
        """Return the Keras config extended with ``filepath``, ``expand_vocab`` and ``lowercase``.

        ``embeddings_initializer`` is reported as the value originally passed in
        rather than the constant matrix the layer was actually built with.
        """
        config = {
            'filepath': self.filepath,
            'expand_vocab': self.expand_vocab,
            'lowercase': self.lowercase,
        }
        base_config = super(Word2VecEmbeddingTF, self).get_config()
        base_config['embeddings_initializer'] = self._embeddings_initializer
        return dict(list(base_config.items()) + list(config.items()))


@hanlp_register
class StringWord2VecEmbeddingTF(Word2VecEmbeddingTF):
    """A :class:`Word2VecEmbeddingTF` that takes string tensors as input.

    Strings are converted to ids with the layer's own vocabulary before the
    regular embedding lookup.
    """

    def __init__(self, filepath: str = None, vocab: VocabTF = None, expand_vocab=True, lowercase=False, input_dim=None,
                 output_dim=None, unk=None, normalize=False, embeddings_initializer='VarianceScaling',
                 embeddings_regularizer=None, activity_regularizer=None, embeddings_constraint=None, mask_zero=True,
                 input_length=None, name=None, **kwargs):
        """See :class:`Word2VecEmbeddingTF`; a new ``VocabTF`` is created when ``vocab`` is ``None``."""
        if vocab is None:
            vocab = VocabTF()
        self.vocab = vocab
        super().__init__(filepath, vocab, expand_vocab, lowercase, input_dim, output_dim, unk, normalize,
                         embeddings_initializer, embeddings_regularizer, activity_regularizer, embeddings_constraint,
                         mask_zero, input_length, name, **kwargs)

    def call(self, inputs):
        """Convert a ``tf.string`` tensor to ids and embed them."""
        # The message previously asked the caller to pass the very dtype that was
        # just rejected; the layer requires tf.string.
        assert inputs.dtype == tf.string, \
            f'Expect tf.string but got tf.{inputs.dtype.name}. {inputs} ' \
            f'Please pass tf.string in.'
        inputs = self.vocab.lookup(inputs)
        # inputs._keras_mask = tf.not_equal(inputs, self.vocab.pad_idx)
        return super().call(inputs)

    def compute_mask(self, inputs, mask=None):
        """Mask positions whose (string) input equals the vocabulary's pad token."""
        if not self.mask_zero:
            return None
        return tf.not_equal(inputs, self.vocab.pad_token)


================================================
FILE: hanlp/layers/feed_forward.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-06 14:37
from typing import Union, List

from hanlp.layers import feedforward

from hanlp.common.structure import ConfigTracker


class FeedForward(feedforward.FeedForward, ConfigTracker):
    """``feedforward.FeedForward`` that additionally records its constructor arguments.

    The network itself is built by ``feedforward.FeedForward``; ``ConfigTracker``
    is initialised with the local variables of ``__init__``.
    """

    def __init__(self, input_dim: int, num_layers: int, hidden_dims: Union[int, List[int]],
                 activations: Union[str, List[str]], dropout: Union[float, List[float]] = 0.0) -> None:
        """
        Args:
            input_dim: Dimensionality of the input.
            num_layers: Number of linear layers.
            hidden_dims: Output dimension of each layer, or one value for all layers.
            activations: Activation name of each layer, or one name for all layers.
            dropout: Dropout rate of each layer, or one rate for all layers.
        """
        super().__init__(input_dim, num_layers, hidden_dims, activations, dropout)
        # ``locals()`` holds the constructor arguments here (alongside ``self`` and
        # ``__class__``); no extra local may be introduced before this call.
        # NOTE(review): how ConfigTracker filters these entries is not visible here.
        ConfigTracker.__init__(self, locals())


================================================
FILE: hanlp/layers/feedforward.py
================================================
"""
A feed-forward neural network.
"""
from typing import List, Union

import torch
from hanlp.utils.torch_util import activation_from_name


class FeedForward(torch.nn.Module):
    """
    A feed-forward network: a stack of `Linear` layers, each followed by an
    activation and then dropout.

    # Parameters

    input_dim : `int`, required
        The dimensionality of the input.
    num_layers : `int`, required
        The number of `Linear` layers to apply to the input.
    hidden_dims : `Union[int, List[int]]`, required
        The output dimension of each `Linear` layer. A single `int` is used for
        every layer; a `List[int]` must have exactly `num_layers` entries.
    activations : `Union[str, List[str]]`, required
        The activation to apply after each `Linear` layer, given by name. Each
        name is resolved with `activation_from_name` and the result is called
        without arguments to create the activation module. A single name is used
        for every layer; a list must have exactly `num_layers` entries.
    dropout : `Union[float, List[float]]`, optional (default = `0.0`)
        The dropout rate applied after each layer, with the same single value
        versus list semantics as the other parameters.

    # Structure

    The module owns three `ModuleList`s, registered in this order:
    `_activations`, `_linear_layers` and `_dropout`, each with one entry per layer.
    """

    def __init__(
            self,
            input_dim: int,
            num_layers: int,
            hidden_dims: Union[int, List[int]],
            activations: Union[str, List[str]],
            dropout: Union[float, List[float]] = 0.0,
    ) -> None:

        super().__init__()

        def _per_layer(setting):
            # A setting that is not a list is shared by all layers.
            return setting if isinstance(setting, list) else [setting] * num_layers

        hidden_dims = _per_layer(hidden_dims)
        activations = [activation_from_name(name)() for name in _per_layer(activations)]
        dropout = _per_layer(dropout)

        if len(hidden_dims) != num_layers:
            raise ValueError(
                "len(hidden_dims) (%d) != num_layers (%d)" % (len(hidden_dims), num_layers)
            )
        if len(activations) != num_layers:
            raise ValueError(
                "len(activations) (%d) != num_layers (%d)" % (len(activations), num_layers)
            )
        if len(dropout) != num_layers:
            raise ValueError(
                "len(dropout) (%d) != num_layers (%d)" % (len(dropout), num_layers)
            )

        self._activations = torch.nn.ModuleList(activations)
        # Layer i reads what layer i - 1 produced; the first one reads the input.
        layer_inputs = [input_dim] + hidden_dims[:-1]
        self._linear_layers = torch.nn.ModuleList(
            [torch.nn.Linear(n_in, n_out) for n_in, n_out in zip(layer_inputs, hidden_dims)]
        )
        self._dropout = torch.nn.ModuleList([torch.nn.Dropout(p=rate) for rate in dropout])
        self._output_dim = hidden_dims[-1]
        self.input_dim = input_dim

    def get_output_dim(self):
        """Return the output dimension of the last layer."""
        return self._output_dim

    def get_input_dim(self):
        """Return the input dimension given at construction."""
        return self.input_dim

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Apply linear, activation and dropout of every layer in turn."""
        hidden = inputs
        stages = zip(self._linear_layers, self._activations, self._dropout)
        for linear, activate, drop in stages:
            projected = linear(hidden)
            hidden = drop(activate(projected))
        return hidden


================================================
FILE: hanlp/layers/scalar_mix.py
================================================
# This file is modified from udify, which is licensed under the MIT license:
# MIT License
#
# Copyright (c) 2019 Dan Kondratyuk
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
The dot-product "Layer Attention" that is applied to the layers of BERT, along with layer dropout to reduce overfitting
"""

from typing import List, Tuple

import torch
from torch.nn import ParameterList, Parameter

from hanlp.common.structure import ConfigTracker


class ScalarMixWithDropout(torch.nn.Module):
    """Computes a parameterised scalar mixture of N tensors, ``mixture = gamma * sum(s_k * tensor_k)``
    where ``s = softmax(w)``, with ``w`` and ``gamma`` scalar parameters.

    If ``do_layer_norm=True`` then apply layer normalization to each tensor before weighting.

    If ``dropout > 0``, then during training each scalar weight is replaced by ``dropout_value``
    (a very negative number) with the dropout probability, which moves its softmax mass to the
    remaining weights. No weight is dropped in evaluation mode, so inference is deterministic.

    Args:
        mixture_range: ``(start, end)`` indices of the tensors to mix; ``end - start`` tensors are mixed.
        do_layer_norm: ``True`` to layer-normalize each tensor before weighting.
        initial_scalar_parameters: Initial unnormalized weights, one per mixed tensor. Zeros if omitted.
        trainable: ``False`` to freeze both the scalar weights and ``gamma``.
        dropout: Probability of dropping each scalar weight during training.
        dropout_value: The unnormalized weight assigned to dropped entries.
        **kwargs: Ignored.
    """

    def __init__(self,
                 mixture_range: Tuple[int, int],
                 do_layer_norm: bool = False,
                 initial_scalar_parameters: List[float] = None,
                 trainable: bool = True,
                 dropout: float = None,
                 dropout_value: float = -1e20,
                 **kwargs) -> None:
        super(ScalarMixWithDropout, self).__init__()
        self.mixture_range = mixture_range
        mixture_size = mixture_range[1] - mixture_range[0]
        self.mixture_size = mixture_size
        self.do_layer_norm = do_layer_norm
        self.dropout = dropout

        if initial_scalar_parameters is None:
            initial_scalar_parameters = [0.0] * mixture_size
        elif len(initial_scalar_parameters) != mixture_size:
            raise ValueError("Length of initial_scalar_parameters {} differs "
                             "from mixture_size {}".format(
                len(initial_scalar_parameters), mixture_size))

        # ``trainable`` governs the scalar weights as well as ``gamma``; the weights
        # used to stay trainable regardless of the flag.
        self.scalar_parameters = Parameter(torch.FloatTensor(initial_scalar_parameters), requires_grad=trainable)
        self.gamma = Parameter(torch.FloatTensor([1.0]), requires_grad=trainable)

        if self.dropout:
            dropout_mask = torch.zeros(len(self.scalar_parameters))
            dropout_fill = torch.empty(len(self.scalar_parameters)).fill_(dropout_value)
            self.register_buffer("dropout_mask", dropout_mask)
            self.register_buffer("dropout_fill", dropout_fill)

    def forward(self, tensors: List[torch.Tensor],  # pylint: disable=arguments-differ
                mask: torch.Tensor = None) -> torch.Tensor:
        """Compute a weighted average of the ``tensors``.

        The tensors must all have the same shape. They may be given as a list/tuple or already
        stacked along a new first dimension. If more tensors than ``mixture_size`` are passed,
        the slice ``mixture_range[0]:mixture_range[1]`` is mixed.

        When ``do_layer_norm=True``, the ``mask`` is required input.  If the ``tensors`` are
        dimensioned  ``(dim_0, ..., dim_{n-1}, dim_n)``, then the ``mask`` is dimensioned
        ``(dim_0, ..., dim_{n-1})``, as in the typical case with ``tensors`` of shape
        ``(batch_size, timesteps, dim)`` and ``mask`` of shape ``(batch_size, timesteps)``.

        When ``do_layer_norm=False`` the ``mask`` is ignored.

        Args:
            tensors: The tensors to mix.
            mask: Mask of valid positions, only used with layer normalization.

        Returns:
            The mixture, with the shape of a single input tensor.

        Raises:
            ValueError: If the number of tensors does not match ``mixture_size``, or if
                ``do_layer_norm=True`` and no ``mask`` is given.
        """
        if len(tensors) != self.mixture_size:
            tensors = tensors[self.mixture_range[0]:self.mixture_range[1]]
        if len(tensors) != self.mixture_size:
            raise ValueError("{} tensors were passed, but the module was initialized to "
                             "mix {} tensors.".format(len(tensors), self.mixture_size))

        def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked):
            tensor_masked = tensor * broadcast_mask
            mean = torch.sum(tensor_masked) / num_elements_not_masked
            variance = torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2) / num_elements_not_masked
            return (tensor - mean) / torch.sqrt(variance + 1E-12)

        weights = self.scalar_parameters

        # Layer dropout is a training-time regulariser; applying it in eval mode
        # would randomise predictions.
        if self.dropout and self.training:
            keep = self.dropout_mask.uniform_() > self.dropout
            weights = torch.where(keep, weights, self.dropout_fill)

        normed_weights = torch.nn.functional.softmax(weights, dim=0)

        if not self.do_layer_norm:
            if not torch.is_tensor(tensors):
                tensors = torch.stack(tuple(tensors))
            # The ellipsis keeps this valid for any number of trailing dimensions.
            return self.gamma * torch.einsum('i,i...->...', normed_weights, tensors)

        if mask is None:
            raise ValueError("mask is required when do_layer_norm=True")
        normed_weights = torch.split(normed_weights, split_size_or_sections=1)
        mask_float = mask.float()
        broadcast_mask = mask_float.unsqueeze(-1)
        input_dim = tensors[0].size(-1)
        num_elements_not_masked = torch.sum(mask_float) * input_dim

        pieces = []
        for weight, tensor in zip(normed_weights, tensors):
            pieces.append(weight * _do_layer_norm(tensor,
                                                  broadcast_mask, num_elements_not_masked))
        return self.gamma * sum(pieces)


class ScalarMixWithDropoutBuilder(ConfigTracker, ScalarMixWithDropout):
    """Holds the constructor arguments of a ``ScalarMixWithDropout`` and builds it on demand.

    The arguments mirror those of ``ScalarMixWithDropout``; the module itself is
    only created by :meth:`build`.
    """

    def __init__(self,
                 mixture_range: Tuple[int, int],
                 do_layer_norm: bool = False,
                 initial_scalar_parameters: List[float] = None,
                 trainable: bool = True,
                 dropout: float = None,
                 dropout_value: float = -1e20) -> None:
        # ``ConfigTracker`` is the first base, so this hands the captured locals to
        # it rather than initialising a ``ScalarMixWithDropout``.
        # ``locals()`` must only contain the arguments at this point (plus ``self``
        # and ``__class__``); do not introduce other locals before this call.
        # NOTE(review): how ConfigTracker filters/stores them is not visible here.
        super().__init__(locals())

    def build(self):
        """Create a ``ScalarMixWithDropout`` from ``self.config``.

        ``self.config`` is presumably filled by ``ConfigTracker`` from the
        arguments captured in ``__init__`` — confirm against its definition.
        """
        return ScalarMixWithDropout(**self.config)


================================================
FILE: hanlp/layers/time_distributed.py
================================================
"""
A wrapper that unrolls the second (time) dimension of a tensor
into the first (batch) dimension, applies some other `Module`,
and then rolls the time dimension back up.
"""

from typing import List


import torch


class TimeDistributed(torch.nn.Module):
    """
    Given an input shaped like `(batch_size, time_steps, [rest])` and a `Module` that takes
    inputs like `(batch_size, [rest])`, `TimeDistributed` reshapes the input to be
    `(batch_size * time_steps, [rest])`, applies the contained `Module`, then reshapes it back.

    Note that while the above gives shapes with `batch_size` first, this `Module` also works if
    `batch_size` is second - we always just combine the first two dimensions, then split them.

    It also reshapes keyword arguments unless they are not tensors or their name is specified in
    the optional `pass_through` iterable.

    If the contained `Module` returns a tuple of tensors, every element is reshaped back and a
    tuple is returned.
    """

    def __init__(self, module):
        super().__init__()
        self._module = module

    def forward(self, *inputs, pass_through: List[str] = None, **kwargs):
        """Apply the wrapped module to every time step.

        Args:
            *inputs: Tensors with at least three dimensions; the first two are merged.
            pass_through: Names of keyword arguments that are handed over unchanged.
            **kwargs: Keyword arguments for the wrapped module. Tensors not listed in
                `pass_through` are reshaped like the positional inputs.

        Returns:
            The output of the wrapped module with its first dimension split back into the
            first two dimensions of the input, or a tuple of such tensors.

        Raises:
            RuntimeError: If there is no tensor input to take the leading dimensions from,
                or if a tensor to reshape has fewer than three dimensions.
        """
        pass_through = pass_through or []

        reshaped_inputs = [self._reshape_tensor(input_tensor) for input_tensor in inputs]

        # Need some input to then get the batch_size and time_steps.
        some_input = inputs[-1] if inputs else None

        reshaped_kwargs = {}
        for key, value in kwargs.items():
            if isinstance(value, torch.Tensor) and key not in pass_through:
                if some_input is None:
                    some_input = value

                value = self._reshape_tensor(value)

            reshaped_kwargs[key] = value

        # Fail before running the wrapped module: without a reference tensor the
        # output could not be reshaped anyway.
        if some_input is None:
            raise RuntimeError("No input tensor to time-distribute")

        reshaped_outputs = self._module(*reshaped_inputs, **reshaped_kwargs)

        # Now get the output back into the right shape.
        # (batch_size, time_steps, **output_size)
        leading = some_input.size()[:2]
        if isinstance(reshaped_outputs, tuple):
            return tuple(self._restore_tensor(output, leading) for output in reshaped_outputs)
        return self._restore_tensor(reshaped_outputs, leading)

    @staticmethod
    def _restore_tensor(output_tensor, leading):
        """Split the first axis of ``output_tensor`` back into the two ``leading`` axes."""
        new_size = leading + output_tensor.size()[1:]
        return output_tensor.contiguous().view(new_size)

    @staticmethod
    def _reshape_tensor(input_tensor):
        """Merge the first two axes of ``input_tensor`` into one."""
        input_size = input_tensor.size()
        if len(input_size) <= 2:
            raise RuntimeError(f"No dimension to distribute: {input_size}")
        # Squash batch_size and time_steps into a single axis; result has shape
        # (batch_size * time_steps, **input_size).
        squashed_shape = [-1] + list(input_size[2:])
        return input_tensor.contiguous().view(*squashed_shape)


================================================
FILE: hanlp/layers/transformers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 15:17
# mute transformers
import logging

# Only let errors through from the noisier transformers loggers.
for _muted_logger in (
        'transformers.file_utils',
        'transformers.filelock',
        'transformers.tokenization_utils',
        'transformers.configuration_utils',
        'transformers.modeling_tf_utils',
        'transformers.modeling_utils',
        'transformers.tokenization_utils_base',
):
    logging.getLogger(_muted_logger).setLevel(logging.ERROR)
del _muted_logger


================================================
FILE: hanlp/layers/transformers/encoder.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-22 21:06
import warnings
from typing import Union, Dict, Any, Sequence, Tuple, Optional

import torch
from torch import nn
from hanlp.layers.dropout import WordDropout
from hanlp.layers.scalar_mix import ScalarMixWithDropout, ScalarMixWithDropoutBuilder
from hanlp.layers.transformers.resource import get_tokenizer_mirror
from hanlp.layers.transformers.pt_imports import PreTrainedModel, PreTrainedTokenizer, AutoTokenizer, AutoModel_, \
    BertTokenizer, AutoTokenizer_
from hanlp.layers.transformers.utils import transformer_encode


# noinspection PyAbstractClass
class TransformerEncoder(nn.Module):
    def __init__(self,
                 transformer: Union[PreTrainedModel, str],
                 transformer_tokenizer: PreTrainedTokenizer,
                 average_subwords=False,
                 scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
                 word_dropout=None,
                 max_sequence_length=None,
                 ret_raw_hidden_states=False,
                 transformer_args: Dict[str, Any] = None,
                 trainable: Union[bool, Optional[Tuple[int, int]]] = True,
                 training=True) -> None:
        """A pre-trained transformer encoder.

        Args:
            transformer: A ``PreTrainedModel`` or an identifier of a ``PreTrainedModel``.
            transformer_tokenizer: A ``PreTrainedTokenizer``.
            average_subwords: ``True`` to average subword representations.
            scalar_mix: Layer attention. Only a ``ScalarMixWithDropoutBuilder`` results in a mixing module;
                any other non-``None`` value merely turns on ``output_hidden_states`` when the transformer is
                loaded from an identifier.
            word_dropout: Dropout rate of randomly replacing a subword with MASK. A ``(rate, replacement)``
                pair is also accepted, where ``replacement`` is ``'unk'``, ``'mask'`` or a token id.
            max_sequence_length: The maximum sequence length. Sequence longer than this will be handled by sliding
                window. If ``None`` and ``transformer`` is an identifier, then the ``max_position_embeddings``
                of the transformer will be used.
            ret_raw_hidden_states: ``True`` to return hidden states of each layer.
            transformer_args: Extra arguments passed to the transformer. The dict is copied, not modified.
            trainable: ``False`` to use static embeddings. A ``(start, end)`` pair (tuple or list) trains only
                the layers whose index ``i`` satisfies ``start <= i < end``, where the embeddings (if the model
                has them) come first, followed by the encoder layers; all other layers are frozen.
            training: ``False`` to skip loading weights from pre-trained transformers.
        """
        super().__init__()
        self.ret_raw_hidden_states = ret_raw_hidden_states
        self.average_subwords = average_subwords
        if word_dropout:
            oov = transformer_tokenizer.mask_token_id
            if isinstance(word_dropout, Sequence):
                word_dropout, replacement = word_dropout
                if replacement == 'unk':
                    # Electra English has to use unk
                    oov = transformer_tokenizer.unk_token_id
                elif replacement == 'mask':
                    # UDify uses [MASK]
                    oov = transformer_tokenizer.mask_token_id
                else:
                    oov = replacement
            pad = transformer_tokenizer.pad_token_id
            cls = transformer_tokenizer.cls_token_id
            sep = transformer_tokenizer.sep_token_id
            excludes = [pad, cls, sep]
            self.word_dropout = WordDropout(p=word_dropout, oov_token=oov, exclude_tokens=excludes)
        else:
            self.word_dropout = None
        if isinstance(transformer, str):
            output_hidden_states = scalar_mix is not None
            # Work on a copy so the caller's dict is left untouched.
            transformer_args = dict(transformer_args) if transformer_args else dict()
            transformer_args['output_hidden_states'] = output_hidden_states
            # A frozen transformer always needs its pre-trained weights.
            transformer = AutoModel_.from_pretrained(transformer, training=training or not trainable,
                                                     **transformer_args)
            if max_sequence_length is None:
                max_sequence_length = transformer.config.max_position_embeddings
        self.max_sequence_length = max_sequence_length
        if hasattr(transformer, 'encoder') and hasattr(transformer, 'decoder'):
            # For seq2seq model, use its encoder
            transformer = transformer.encoder
        self.transformer = transformer
        if not trainable:
            transformer.requires_grad_(False)
        elif isinstance(trainable, (tuple, list)):
            # Lists are accepted too: a tuple becomes a list after a JSON round trip.
            first_trainable, end_trainable = trainable[0], trainable[1]
            layers = []
            if hasattr(transformer, 'embeddings'):
                layers.append(transformer.embeddings)
            layers.extend(transformer.encoder.layer)
            for i, layer in enumerate(layers):
                if i < first_trainable or i >= end_trainable:
                    layer.requires_grad_(False)

        if isinstance(scalar_mix, ScalarMixWithDropoutBuilder):
            self.scalar_mix: ScalarMixWithDropout = scalar_mix.build()
        else:
            self.scalar_mix = None

    def forward(self, input_ids: torch.LongTensor, attention_mask=None, token_type_ids=None, token_span=None, **kwargs):
        """Encode ``input_ids``.

        Word dropout (if configured) is applied to the ids first; the hidden states are then combined by the
        scalar mix if there is one.

        Returns:
            The encoded representation, or a ``(representation, raw_hidden_states)`` pair when
            ``ret_raw_hidden_states`` is set.
        """
        if self.word_dropout:
            input_ids = self.word_dropout(input_ids)

        x = transformer_encode(self.transformer,
                               input_ids,
                               attention_mask,
                               token_type_ids,
                               token_span,
                               layer_range=self.scalar_mix.mixture_range if self.scalar_mix else 0,
                               max_sequence_length=self.max_sequence_length,
                               average_subwords=self.average_subwords,
                               ret_raw_hidden_states=self.ret_raw_hidden_states)
        if self.ret_raw_hidden_states:
            x, raw_hidden_states = x
        if self.scalar_mix:
            x = self.scalar_mix(x)
        if self.ret_raw_hidden_states:
            # noinspection PyUnboundLocalVariable
            return x, raw_hidden_states
        return x

    @staticmethod
    def build_transformer(config, training=True) -> PreTrainedModel:
        """Load the transformer named by ``config.transformer``.

        Hidden states of all layers are requested when ``config.scalar_mix`` is a positive number.
        """
        kwargs = {}
        if config.scalar_mix and config.scalar_mix > 0:
            kwargs['output_hidden_states'] = True
        transformer = AutoModel_.from_pretrained(config.transformer, training=training, **kwargs)
        return transformer

    @staticmethod
    def build_transformer_tokenizer(config_or_str, use_fast=True, do_basic_tokenize=True) -> PreTrainedTokenizer:
        """Load the tokenizer for ``config_or_str`` through ``AutoTokenizer_``."""
        return AutoTokenizer_.from_pretrained(config_or_str, use_fast, do_basic_tokenize)


================================================
FILE: hanlp/layers/transformers/loader_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-04 06:05
import tensorflow as tf
from transformers import TFAutoModel

from hanlp.layers.transformers.pt_imports import AutoTokenizer_, AutoModel_


def build_transformer(transformer, max_seq_length, num_labels, tagging=True, tokenizer_only=False):
    """Build a Keras model made of a transformer followed by a dense output layer.

    Args:
        transformer: Identifier passed to both the tokenizer and ``TFAutoModel``.
        max_seq_length: Fixed length of the three integer inputs.
        num_labels: Number of units of the final dense layer.
        tagging: When ``False`` only the first position of the sequence output feeds the dense layer;
            otherwise every position does.
        tokenizer_only: When ``True`` return the tokenizer alone without building a model.

    Returns:
        The tokenizer if ``tokenizer_only``, otherwise a ``(model, tokenizer)`` tuple.
    """
    tokenizer = AutoTokenizer_.from_pretrained(transformer)
    if tokenizer_only:
        return tokenizer

    def int_input(name):
        # All three inputs share the same shape and dtype and differ only by name.
        return tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name=name)

    encoder = TFAutoModel.from_pretrained(transformer)
    input_ids = int_input("input_ids")
    mask_ids = int_input("mask_ids")
    token_type_ids = int_input("token_type_ids")
    hidden = encoder(input_ids=input_ids,
                     token_type_ids=token_type_ids,
                     attention_mask=mask_ids).last_hidden_state
    if not tagging:
        hidden = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(hidden)
    logits = tf.keras.layers.Dense(num_labels)(hidden)
    model = tf.keras.Model(inputs=[input_ids, mask_ids, token_type_ids], outputs=logits)
    model.build(input_shape=(None, max_seq_length))
    return model, tokenizer


================================================
FILE: hanlp/layers/transformers/pt_imports.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-09 11:25
import os
import warnings

from hanlp.layers.transformers.resource import get_tokenizer_mirror, get_model_mirror

if os.environ.get('TOKENIZERS_PARALLELISM', None) is None:
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
from transformers import BertTokenizer, BertConfig, PretrainedConfig, AutoConfig, AutoTokenizer, PreTrainedTokenizer, \
    BertTokenizerFast, AlbertConfig, BertModel, AutoModel, PreTrainedModel, AutoModelForSequenceClassification, \
    AutoModelForTokenClassification, BartModel


class AutoModel_(AutoModel):
    """``AutoModel`` variant that resolves HanLP mirrors and can skip loading pretrained weights."""

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, training=True, **kwargs):
        """Instantiate a model, from pretrained weights or from its configuration only.

        Args:
            pretrained_model_name_or_path: A transformer identifier (``str``) or, when ``training`` is
                ``False``, an already built configuration object.
            *model_args: Forwarded to the parent ``from_pretrained`` when ``training`` is ``True``.
            training: ``True`` to load pretrained weights. ``False`` to build the model from its
                configuration only via ``from_config``.
            **kwargs: Forwarded to the parent ``from_pretrained`` (training) or to
                ``AutoConfig.from_pretrained`` (not training, str identifier). Must be empty when a
                configuration object is passed.

        Returns:
            The instantiated model.
        """
        is_identifier = isinstance(pretrained_model_name_or_path, str)
        if is_identifier:
            # The mirror tables are keyed by str. A configuration object is never a valid key and
            # may not even be hashable, so it must not reach the dict lookup.
            pretrained_model_name_or_path = get_model_mirror(pretrained_model_name_or_path)
        if training:
            return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        if is_identifier:
            # Only the configuration is needed here, so fall back to the tokenizer mirror.
            pretrained_model_name_or_path = get_tokenizer_mirror(pretrained_model_name_or_path)
            return super().from_config(AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs))
        assert not kwargs, 'kwargs are not supported when a config object is passed'
        return super().from_config(pretrained_model_name_or_path)


class AutoConfig_(AutoConfig):
    """``AutoConfig`` variant that resolves the identifier through the tokenizer mirrors first."""

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load a configuration after mapping the identifier with ``get_tokenizer_mirror``.

        Args:
            pretrained_model_name_or_path: Transformer identifier.
            **kwargs: Forwarded to the parent ``from_pretrained``.

        Returns:
            Whatever the parent ``from_pretrained`` returns for the resolved identifier.
        """
        resolved = get_tokenizer_mirror(pretrained_model_name_or_path)
        return super().from_pretrained(resolved, **kwargs)


class AutoTokenizer_(AutoTokenizer):
    """``AutoTokenizer`` variant that picks a tokenizer class per model and resolves mirrors."""

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True,
                        do_basic_tokenize=True) -> PreTrainedTokenizer:
        """Load the tokenizer matching a transformer.

        Args:
            pretrained_model_name_or_path: A transformer identifier, or an object whose
                ``transformer`` attribute holds one.
            use_fast: Forwarded as ``use_fast``. It also selects the fast BERT tokenizer for
                ``Langboat/mengzi-bert-base``.
            do_basic_tokenize: Forwarded as ``do_basic_tokenize``. A warning is emitted when it is
                ``False`` while ``use_fast`` is ``True``.

        Returns:
            The tokenizer, with ``name_or_path`` set to the original (un-mirrored) identifier.
        """
        if isinstance(pretrained_model_name_or_path, str):
            transformer = pretrained_model_name_or_path
        else:
            transformer = pretrained_model_name_or_path.transformer

        extra_config = {}
        if transformer.startswith(('voidful/albert_chinese_', 'uer/albert')):
            tokenizer_cls = BertTokenizer
        elif transformer == 'cl-tohoku/bert-base-japanese-char':
            # It is a char level model, so a char level (basic) word tokenizer is fine instead of fugashi.
            from transformers import BertJapaneseTokenizer
            tokenizer_cls = BertJapaneseTokenizer
            extra_config['word_tokenizer_type'] = 'basic'
        elif transformer == "Langboat/mengzi-bert-base":
            tokenizer_cls = BertTokenizerFast if use_fast else BertTokenizer
        else:
            tokenizer_cls = AutoTokenizer

        if use_fast and not do_basic_tokenize:
            warnings.warn('`do_basic_tokenize=False` might not work when `use_fast=True`')
        tokenizer = tokenizer_cls.from_pretrained(get_tokenizer_mirror(transformer),
                                                  use_fast=use_fast,
                                                  do_basic_tokenize=do_basic_tokenize,
                                                  **extra_config)
        # Keep the public identifier rather than the resolved mirror path.
        tokenizer.name_or_path = transformer
        return tokenizer


================================================
FILE: hanlp/layers/transformers/relative_transformer.py
================================================
# A modified version of the implementation from the following paper:
# TENER: Adapting Transformer Encoder for Named Entity Recognition
# Hang Yan, Bocao Deng, Xiaonan Li, Xipeng Qiu

import math
import torch
import torch.nn.functional as F
from torch import Tensor, nn

from hanlp.common.structure import ConfigTracker


class RelativeSinusoidalPositionalEmbedding(nn.Module):
    """Sinusoidal embeddings for relative positions, for sequences of any length.

    A table of embeddings is kept in the ``weights`` buffer and is rebuilt with a larger size
    whenever a longer sequence is seen. ``origin_shift`` is the row index of relative position 0.

    Args:
        embedding_dim: Embedding size of each position.
        padding_idx: Row of the table that is zeroed out. It is also added to the sequence length
            when deciding whether the table has to grow.
        init_size: Initial (even) size of the table; ``init_size + 1`` rows are allocated.
    """

    def __init__(self, embedding_dim, padding_idx, init_size=1024):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        assert init_size % 2 == 0
        weights = self.get_embedding(
            init_size + 1,
            embedding_dim,
            padding_idx,
        )
        self.register_buffer('weights', weights)

    def get_embedding(self, num_embeddings, embedding_dim, padding_idx=None):
        """Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly
        from the description in Section 3.5 of "Attention Is All You Need".

        As a side effect ``self.origin_shift`` is set to ``num_embeddings // 2 + 1``.

        Args:
          num_embeddings: Number of rows (relative positions) of the table.
          embedding_dim: Size of each embedding.
          padding_idx: Row to zero out, or ``None`` to keep every row.  (Default value = None)

        Returns:
          A ``num_embeddings x embedding_dim`` float tensor.

        """
        half_dim = embedding_dim // 2
        # With a single frequency (embedding_dim of 2 or 3) ``half_dim - 1`` is 0. Clamp the
        # denominator so the table can still be built instead of raising ZeroDivisionError.
        scale = math.log(10000) / max(half_dim - 1, 1)
        freqs = torch.exp(torch.arange(half_dim, dtype=torch.float) * -scale)
        positions = torch.arange(-num_embeddings // 2, num_embeddings // 2, dtype=torch.float)
        angles = positions.unsqueeze(1) * freqs.unsqueeze(0)
        emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad the last column for odd embedding sizes
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0
        self.origin_shift = num_embeddings // 2 + 1
        return emb

    def forward(self, inputs: Tensor):
        """Return the embeddings of relative positions ``-seq_len .. seq_len - 1``.

        Args:
          inputs: Tensor of size [bsz x seqlen]. Only its size and device are used.

        Returns:
          A detached ``2 * seqlen x embedding_dim`` tensor.

        """
        _, seq_len = inputs.size()
        max_pos = self.padding_idx + seq_len
        if max_pos >= self.origin_shift:
            # recompute/expand embeddings if needed
            weights = self.get_embedding(
                max_pos * 2,
                self.embedding_dim,
                self.padding_idx,
            )
            weights = weights.to(self.weights.device)
            del self.weights
            # The expanded table has an even number of rows, so position 0 sits exactly in the middle.
            self.origin_shift = weights.size(0) // 2
            self.register_buffer('weights', weights)

        # Build the (int64) index directly on the target device.
        positions = torch.arange(-seq_len, seq_len, device=inputs.device) + self.origin_shift  # 2*seq_len
        return self.weights.index_select(0, positions).detach()


class RelativeMultiHeadAttn(nn.Module):
    """Multi-head self attention with relative sinusoidal position scores."""

    def __init__(self, in_features, num_heads, dropout, r_w_bias=None, r_r_bias=None, init_seq_length=1024,
                 k_as_x=True):
        """
        Args:
            in_features: Size of the input (and output) features.
            num_heads: Number of attention heads; each head has ``in_features // num_heads`` dimensions.
            dropout: Dropout rate applied to the attention weights.
            r_w_bias: n_head x head_dim or None
            r_r_bias: n_head x head_dim or None
            init_seq_length: Initial size of the relative position table.
            k_as_x: ``True`` to use the input itself as keys (only queries and values are projected).
                ``False`` to project keys too and add a key-position term to the scores.
        """
        super().__init__()
        self.k_as_x = k_as_x
        if k_as_x:
            self.qv_linear = nn.Linear(in_features, in_features * 2, bias=False)
        else:
            self.qkv_linear = nn.Linear(in_features, in_features * 3, bias=False)
        self.n_head = num_heads
        self.head_dim = in_features // num_heads
        self.dropout_layer = nn.Dropout(dropout)
        self.pos_embed = RelativeSinusoidalPositionalEmbedding(self.head_dim, 0, init_seq_length)
        if r_r_bias is None or r_w_bias is None:  # Biases are not shared
            self.r_r_bias = nn.Parameter(nn.init.xavier_normal_(torch.zeros(num_heads, in_features // num_heads)))
            self.r_w_bias = nn.Parameter(nn.init.xavier_normal_(torch.zeros(num_heads, in_features // num_heads)))
        else:
            self.r_r_bias = r_r_bias  # r_r_bias is v
            self.r_w_bias = r_w_bias  # r_w_bias is u

    def forward(self, x, mask):
        """Attend over ``x`` using content and relative position scores.

        Args:
          x: batch_size x max_len x d_model
          mask: batch_size x max_len. Key positions where it equals 0 receive no attention.

        Returns:
          batch_size x max_len x d_model

        """

        batch_size, max_len, d_model = x.size()
        pos_embed = self.pos_embed(mask)  # 2max_len x head_dim

        if self.k_as_x:
            qv = self.qv_linear(x)  # batch_size x max_len x d_model2
            q, v = torch.chunk(qv, chunks=2, dim=-1)
            k = x.view(batch_size, max_len, self.n_head, -1).transpose(1, 2)
        else:
            qkv = self.qkv_linear(x)  # batch_size x max_len x d_model3
            q, k, v = torch.chunk(qkv, chunks=3, dim=-1)
            k = k.view(batch_size, max_len, self.n_head, -1).transpose(1, 2)

        q = q.view(batch_size, max_len, self.n_head, -1).transpose(1, 2)
        v = v.view(batch_size, max_len, self.n_head, -1).transpose(1, 2)  # b x n x l x d

        rw_head_q = q + self.r_r_bias[:, None]
        AC = torch.einsum('bnqd,bnkd->bnqk', rw_head_q, k)  # b x n x l x l, n is the head

        # head x 2max_len, the bias of every head towards every relative position
        D_ = torch.einsum('nd,ld->nl', self.r_w_bias, pos_embed)[None, :, None]
        # bsz x head x max_len x 2max_len, the score of every query for every shift
        B_ = torch.einsum('bnqd,ld->bnql', q, pos_embed)
        # bsz x head x max_len x 2max_len, to be converted into bsz x head x max_len x max_len
        BD = B_ + D_
        if self.k_as_x:
            attn = AC + self._shift(BD)
        else:
            # The key-position term is only part of the score in this mode, so it is only computed here.
            # bsz x head x max_len x 2max_len, the bias of every key towards every relative position
            E_ = torch.einsum('bnqd,ld->bnql', k, pos_embed)
            attn = AC + (self._shift(BD) + self._transpose_shift(E_))

        attn = attn.masked_fill(mask[:, None, None, :].eq(0), float('-inf'))

        attn = F.softmax(attn, dim=-1)
        attn = self.dropout_layer(attn)
        v = torch.matmul(attn, v).transpose(1, 2).reshape(batch_size, max_len, d_model)  # b x l x d_model

        return v

    def _shift(self, BD):
        """Turn per-shift scores into per-key scores. For example

        -3 -2 -1 0 1 2
        -3 -2 -1 0 1 2
        -3 -2 -1 0 1 2

        is converted into

        0   1  2
        -1  0  1
        -2 -1  0

        Args:
          BD: batch_size x n_head x max_len x 2max_len

        Returns:
          batch_size x n_head x max_len x max_len

        """
        bsz, n_head, max_len, _ = BD.size()
        zero_pad = BD.new_zeros(bsz, n_head, max_len, 1)
        BD = torch.cat([BD, zero_pad], dim=-1).view(bsz, n_head, -1, max_len)  # bsz x n_head x (2max_len+1) x max_len
        BD = BD.narrow(dim=2, start=0, length=2 * max_len) \
            .view(bsz, n_head, max_len, -1)  # bsz x n_head x max_len x 2max_len
        BD = BD.narrow(dim=-1, start=max_len, length=max_len)
        return BD

    def _transpose_shift(self, E):
        """Turn per-shift key scores into per-query scores. For example

          -3   -2   -1   0   1   2
         -30  -20  -10  00  10  20
        -300 -200 -100 000 100 200

        is converted into

          0  -10   -200
          1   00   -100
          2   10    000

        Args:
          E: batch_size x n_head x max_len x 2max_len

        Returns:
          batch_size x n_head x max_len x max_len

        """
        bsz, n_head, max_len, _ = E.size()
        zero_pad = E.new_zeros(bsz, n_head, max_len, 1)
        # bsz x n_head x (2max_len+1) x max_len
        E = torch.cat([E, zero_pad], dim=-1).view(bsz, n_head, -1, max_len)
        indice = (torch.arange(max_len) * 2 + 1).to(E.device)
        E = E.index_select(index=indice, dim=-2).transpose(-1, -2)  # bsz x n_head x max_len x max_len

        return E


class RelativeTransformerLayer(nn.Module):
    """One transformer layer: relative multi-head attention followed by a feed-forward network.

    Both sub-layers are wrapped by a residual connection and a layer normalization that is applied
    either after the residual sum (``after_norm=True``) or on the sub-layer input (``after_norm=False``).
    """

    def __init__(self,
                 in_features,
                 num_heads=4,
                 feedforward_dim=256,
                 dropout=0.2,
                 dropout_attn=None,
                 after_norm=True,
                 k_as_x=True,
                 init_seq_length=1024):
        """
        Args:
            in_features: Size of the input and output features.
            num_heads: Number of attention heads.
            feedforward_dim: Hidden size of the feed-forward network.
            dropout: Dropout rate of the feed-forward network.
            dropout_attn: Dropout rate of the attention weights; falls back to ``dropout`` when ``None``.
            after_norm: ``True`` to normalize after each residual sum, ``False`` to normalize before
                each sub-layer.
            k_as_x: Forwarded to :class:`RelativeMultiHeadAttn`.
            init_seq_length: Forwarded to :class:`RelativeMultiHeadAttn`.
        """
        super().__init__()
        attn_dropout = dropout if dropout_attn is None else dropout_attn
        self.after_norm = after_norm
        self.norm1 = nn.LayerNorm(in_features)
        self.norm2 = nn.LayerNorm(in_features)
        self.self_attn = RelativeMultiHeadAttn(in_features,
                                               num_heads,
                                               dropout=attn_dropout,
                                               init_seq_length=init_seq_length,
                                               k_as_x=k_as_x)
        feedforward = [
            nn.Linear(in_features, feedforward_dim),
            nn.LeakyReLU(),
            nn.Dropout(dropout, inplace=True),
            nn.Linear(feedforward_dim, in_features),
            nn.Dropout(dropout, inplace=True),
        ]
        self.ffn = nn.Sequential(*feedforward)

    def forward(self, x, mask):
        """Run attention and the feed-forward network with residual connections.

        Args:
          x: batch_size x max_len x hidden_size
          mask: batch_size x max_len, positions equal to 0 are padding

        Returns:
          batch_size x max_len x hidden_size

        """
        pre_norm = not self.after_norm

        # Attention sub-layer.
        attn_input = self.norm1(x) if pre_norm else x
        hidden = self.self_attn(attn_input, mask) + x
        if self.after_norm:
            hidden = self.norm1(hidden)

        # Feed-forward sub-layer.
        ffn_input = self.norm2(hidden) if pre_norm else hidden
        output = hidden + self.ffn(ffn_input)
        if self.after_norm:
            output = self.norm2(output)
        return output


class RelativeTransformer(nn.Module):
    """A stack of :class:`RelativeTransformerLayer` modules applied one after another."""

    def __init__(self,
                 in_features,
                 num_layers,
                 feedforward_dim,
                 num_heads,
                 dropout,
                 dropout_attn=None,
                 after_norm=True,
                 init_seq_length=1024,
                 k_as_x=True):
        """
        Args:
            in_features: Size of the input and output features.
            num_layers: Number of layers in the stack.
            feedforward_dim: Hidden size of the feed-forward network of every layer.
            num_heads: Number of attention heads of every layer.
            dropout: Dropout rate of the feed-forward networks.
            dropout_attn: Dropout rate of the attention weights.
            after_norm: ``True`` to normalize after each residual sum.
            init_seq_length: Initial size of the relative position tables.
            k_as_x: ``True`` to use the layer input itself as attention keys.
        """
        super().__init__()
        # Keyword arguments on purpose: RelativeTransformerLayer declares ``num_heads`` before
        # ``feedforward_dim`` while this class declares them in the opposite order, so passing
        # them positionally would silently swap the two values.
        self.layers = nn.ModuleList([
            RelativeTransformerLayer(in_features,
                                     num_heads=num_heads,
                                     feedforward_dim=feedforward_dim,
                                     dropout=dropout,
                                     dropout_attn=dropout_attn,
                                     after_norm=after_norm,
                                     init_seq_length=init_seq_length,
                                     k_as_x=k_as_x)
            for _ in range(num_layers)
        ])

    def forward(self, x: Tensor, mask: Tensor):
        """Run every layer of the stack.

        Args:
          x: batch_size x max_len x hidden_size
          mask: batch_size x max_len. Positions holding a value are 1.

        Returns:
          A tensor of the same size as ``x``; an empty ``x`` is returned untouched.

        """
        if not x.numel():
            return x
        for layer in self.layers:
            x = layer(x, mask)
        return x


class RelativeTransformerEncoder(RelativeTransformer, ConfigTracker):
    """A :class:`RelativeTransformer` with default hyper-parameters that records its configuration."""

    def __init__(self,
                 in_features,
                 num_layers=2,
                 num_heads=4,
                 feedforward_dim=256,
                 dropout=0.1,
                 dropout_attn=0.1,
                 after_norm=True,
                 k_as_x=True,
                 ):
        """
        Args:
            in_features: Size of the input and output features.
            num_layers: Number of layers in the stack.
            num_heads: Number of attention heads of every layer.
            feedforward_dim: Hidden size of the feed-forward network of every layer.
            dropout: Dropout rate of the feed-forward networks.
            dropout_attn: Dropout rate of the attention weights.
            after_norm: ``True`` to normalize after each residual sum.
            k_as_x: ``True`` to use the layer input itself as attention keys.
        """
        # Keyword arguments keep ``num_heads``/``feedforward_dim`` in the right slots of the parent
        # (whose parameter order differs from this class) and make sure ``k_as_x`` is honoured.
        # No local variable may be introduced before ``locals()`` is recorded below.
        super().__init__(in_features,
                         num_layers,
                         feedforward_dim=feedforward_dim,
                         num_heads=num_heads,
                         dropout=dropout,
                         dropout_attn=dropout_attn,
                         after_norm=after_norm,
                         k_as_x=k_as_x)
        ConfigTracker.__init__(self, locals())

    def get_output_dim(self):
        """Return the size of the output features, which equals ``in_features``."""
        return self.config['in_features']


================================================
FILE: hanlp/layers/transformers/resource.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-20 12:43
from hanlp.utils.io_util import get_resource
from hanlp_common.constant import HANLP_URL

# Transformer identifier -> URL (under HANLP_URL) of a zip archive; consulted by
# ``get_tokenizer_mirror``. Identifiers without an entry are used as they are.
tokenizer_mirrors = {
    'hfl/chinese-electra-180g-base-discriminator': HANLP_URL + 'transformers/electra_zh_base_20210706_125233.zip',
    'hfl/chinese-electra-180g-small-discriminator': HANLP_URL + 'transformers/electra_zh_small_20210706_125427.zip',
    'xlm-roberta-base': HANLP_URL + 'transformers/xlm-roberta-base_20210706_125502.zip',
    'cl-tohoku/bert-base-japanese-char': HANLP_URL + 'transformers/bert-base-japanese-char_20210602_215445.zip',
    'bart5-chinese-small': HANLP_URL + 'transformers/bart5-chinese-small_tok_20210723_180743.zip',
    'ernie-gram': HANLP_URL + 'transformers/ernie-gram_20220207_103518.zip',
    'xlm-roberta-base-no-space': HANLP_URL + 'transformers/xlm-roberta-base-no-space-tokenizer_20220610_204241.zip',
    'mMiniLMv2L6-no-space': HANLP_URL + 'transformers/mMiniLMv2L6-no-space-tokenizer_20220616_094859.zip',
    'mMiniLMv2L12-no-space': HANLP_URL + 'transformers/mMiniLMv2L12-no-space-tokenizer_20220616_095900.zip',
}

# Transformer identifier -> URL (under HANLP_URL) of a zip archive; consulted by
# ``get_model_mirror``. Identifiers without an entry are used as they are.
model_mirrors = {
    'bart5-chinese-small': HANLP_URL + 'transformers/bart5-chinese-small_20210723_203923.zip',
    'xlm-roberta-base-no-space': HANLP_URL + 'transformers/xlm-roberta-base-no-space_20220610_203944.zip',
    'mMiniLMv2L6-no-space': HANLP_URL + 'transformers/mMiniLMv2L6-no-space_20220616_094949.zip',
    'mMiniLMv2L12-no-space': HANLP_URL + 'transformers/mMiniLMv2L12-no-space_20220616_095924.zip',
}


def get_tokenizer_mirror(transformer: str) -> str:
    """Resolve a transformer identifier through ``tokenizer_mirrors``.

    Args:
        transformer: Transformer identifier.

    Returns:
        The result of ``get_resource`` on the mirror URL when one is registered, otherwise
        ``transformer`` unchanged.
    """
    mirror_url = tokenizer_mirrors.get(transformer)
    return get_resource(mirror_url) if mirror_url else transformer


def get_model_mirror(transformer: str) -> str:
    """Resolve a transformer identifier through ``model_mirrors``.

    Args:
        transformer: Transformer identifier.

    Returns:
        The result of ``get_resource`` on the mirror URL when one is registered, otherwise
        ``transformer`` unchanged.
    """
    mirror_url = model_mirrors.get(transformer)
    return get_resource(mirror_url) if mirror_url else transformer


================================================
FILE: hanlp/layers/transformers/tf_imports.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 21:57
from transformers import BertTokenizer, BertConfig, PretrainedConfig, TFAutoModel, \
    AutoConfig, AutoTokenizer, PreTrainedTokenizer, TFPreTrainedModel, TFAlbertModel, TFAutoModelWithLMHead, \
    BertTokenizerFast, TFAlbertForMaskedLM, AlbertConfig, TFBertModel


================================================
FILE: hanlp/layers/transformers/utils.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-15 21:22
from collections import defaultdict
from typing import Tuple, Union

import torch
from torch.nn import functional as F

from hanlp.components.parsers.ud import udify_util as util
from hanlp.layers.transformers.pt_imports import PreTrainedModel


def transformer_encode(transformer: PreTrainedModel,
                       input_ids,
                       attention_mask=None,
                       token_type_ids=None,
                       token_span=None,
                       layer_range: Union[int, Tuple[int, int]] = 0,
                       max_sequence_length=None,
                       average_subwords=False,
                       ret_raw_hidden_states=False):
    """Run transformer and pool its outputs.

    Args:
        transformer: A transformer model.
        input_ids: Indices of subwords.
        attention_mask: Mask for these subwords. Defaults to ``input_ids.ne(0)``. It is not used
            when the sliding window is triggered.
        token_type_ids: Type ids for each subword. Not used when the sliding window is triggered.
        token_span: The spans of tokens.
        layer_range: The range of layers to use, only relevant when the transformer outputs its
            hidden states. An ``int`` selects every layer from that index on; a pair selects the
            half-open range ``[start, end)``. Note that the 0-th layer means embedding layer, so
            the last 3 layers of a 12-layer BERT will be (10, 13).
        max_sequence_length: The maximum sequence length. Sequence longer than this will be handled by sliding
                    window.
        average_subwords: ``True`` to average subword representations.
        ret_raw_hidden_states: ``True`` to also return the un-pooled transformer outputs.

    Returns:
        Pooled outputs, or a tuple of pooled outputs and raw outputs when ``ret_raw_hidden_states``.
        When the transformer outputs hidden states, the pooled outputs carry a leading layer dimension.

    """
    output_hidden_states = transformer.config.output_hidden_states
    if max_sequence_length and input_ids.size(-1) > max_sequence_length:
        # TODO: split token type ids in transformer_sliding_window if token type ids are not always 1
        outputs = transformer_sliding_window(transformer, input_ids, max_pieces=max_sequence_length)
    else:
        if attention_mask is None:
            attention_mask = input_ids.ne(0)
        model_outputs = transformer(input_ids, attention_mask, token_type_ids)
        # Hidden states are the last element of the model outputs, the final layer is the first.
        outputs = model_outputs[-1] if output_hidden_states else model_outputs[0]

    if not output_hidden_states:
        pooled = pick_tensor_for_each_token(outputs, token_span, average_subwords)
        if ret_raw_hidden_states:
            return pooled, outputs
        return pooled

    if isinstance(layer_range, int):
        outputs = outputs[layer_range:]
    else:
        # A pair is a half-open range of layers and therefore a slice. Indexing with the pair itself
        # fails on a tuple of hidden states and picks a single (layer, batch) cell of a stacked tensor.
        outputs = outputs[layer_range[0]:layer_range[1]]
    x = outputs if isinstance(outputs, torch.Tensor) else torch.stack(outputs)
    num_layers, batch_size, _, hidden_size = x.size()
    # Fold the layers into the batch so that all of them are pooled by a single gather.
    x = x.flatten(end_dim=1)
    if token_span is not None:
        # tile token_span as x
        token_span = token_span.repeat(num_layers, 1, 1)
    hs = pick_tensor_for_each_token(x, token_span, average_subwords).view(num_layers, batch_size, -1, hidden_size)
    if ret_raw_hidden_states:
        return hs, outputs
    return hs


def pick_tensor_for_each_token(h, token_span, average_subwords):
    """Pool subword representations into one representation per token.

    Args:
        h: Subword representations, indexed as (batch, subword, hidden).
        token_span: Subword indices of every token, indexed as (batch, token, subword slot), or
            ``None`` to return ``h`` untouched. Index 0 marks an unused slot, except for the first
            slot of the first token.
        average_subwords: ``True`` to average all the subwords of a token (when there is more than
            one slot), ``False`` to take the first subword only.

    Returns:
        Token representations indexed as (batch, token, hidden), or ``h`` itself when ``token_span``
        is ``None``.
    """
    if token_span is None:
        return h

    hidden_size = h.size(-1)
    if not (average_subwords and token_span.size(-1) > 1):
        # First subword of every token.
        first_subword = token_span[:, :, 0].unsqueeze(-1).expand(-1, -1, hidden_size)
        return h.gather(1, first_subword)

    batch_size = h.size(0)
    flat_index = token_span.view(batch_size, -1).unsqueeze(-1).expand(-1, -1, h.shape[-1])
    gathered = h.gather(1, flat_index).view(batch_size, *token_span.shape[1:], -1)
    valid = token_span.ne(0)
    # Index 0 is a real subword for the very first slot of the first token.
    valid[:, 0, 0] = True
    summed = (gathered * valid.unsqueeze(-1)).sum(2)
    counts = valid.sum(-1).unsqueeze(-1)
    # Tokens without any valid subword would divide by zero; count them as one instead.
    counts = counts.masked_fill(counts == 0, 1)
    return summed / counts


def transformer_sliding_window(transformer: PreTrainedModel,
                               input_ids: torch.LongTensor,
                               input_mask=None,
                               offsets: torch.LongTensor = None,
                               token_type_ids: torch.LongTensor = None,
                               max_pieces=512,
                               start_tokens: int = 1,
                               end_tokens: int = 1,
                               ret_cls=None,
                               ) -> torch.Tensor:
    """Run a transformer over inputs that may be longer than ``max_pieces`` using windows.

    Args:
      transformer: The transformer model to run.
      input_ids: torch.LongTensor: Subword indices; the last dimension is the sequence.
      input_mask: Attention mask. Defaults to ``input_ids != 0`` computed after splitting.  (Default value = None)
      offsets: torch.LongTensor: Optional indices used to select positions from the output.  (Default value = None)
      token_type_ids: torch.LongTensor: Currently not passed to the transformer.  (Default value = None)
      max_pieces: Length of each window.  (Default value = 512)
      start_tokens: int: Number of special tokens at the start of each window.  (Default value = 1)
      end_tokens: int: Number of special tokens at the end of each window.  (Default value = 1)
      ret_cls: When not ``None`` and the input had to be split, return pooled representations of the
        first position of every window instead: ``'max'`` for max pooling, ``'raw'`` for the
        un-pooled ``(cls_hidden, cls_mask)`` pair, anything else for the mean.  (Default value = None)

    Returns:
      The recombined transformer outputs. When the transformer returns hidden states (and ``ret_cls``
      is ``None``) they are stacked along a new leading layer dimension, otherwise the first model
      output is used.

    """
    # pylint: disable=arguments-differ
    batch_size, full_seq_len = input_ids.size(0), input_ids.size(-1)
    initial_dims = list(input_ids.shape[:-1])

    # The embedder may receive an input tensor that has a sequence length longer than can
    # be fit. In that case, we should expect the wordpiece indexer to create padded windows
    # of length `max_pieces` for us, and have them concatenated into one long sequence.
    # E.g., "[CLS] I went to the [SEP] [CLS] to the store to [SEP] ..."
    # We can then split the sequence into sub-sequences of that length, and concatenate them
    # along the batch dimension so we effectively have one huge batch of partial sentences.
    # This can then be fed into BERT without any sentence length issues. Keep in mind
    # that the memory consumption can dramatically increase for large batches with extremely
    # long sentences.
    needs_split = full_seq_len > max_pieces
    if needs_split:
        input_ids = split_to_sliding_window(input_ids, max_pieces)

    # if token_type_ids is None:
    #     token_type_ids = torch.zeros_like(input_ids)
    if input_mask is None:
        input_mask = (input_ids != 0).long()

    # input_ids may have extra dimensions, so we reshape down to 2-d
    # before calling the BERT model and then reshape back at the end.
    model_output = transformer(input_ids=util.combine_initial_dims_to_1d_or_2d(input_ids),
                               # token_type_ids=util.combine_initial_dims_to_1d_or_2d(token_type_ids),
                               attention_mask=util.combine_initial_dims_to_1d_or_2d(input_mask))
    # Read the hidden states from the model output itself: once converted to a plain tuple the
    # ``hidden_states`` attribute is gone, and the tuple length does not tell which optional outputs
    # (pooler output, hidden states, ...) are present.
    hidden_states = getattr(model_output, 'hidden_states', None)
    if hidden_states is not None and ret_cls is None:
        all_encoder_layers = torch.stack(hidden_states)
    else:
        # The first output is the final layer; the CLS pooling below works on that layer only.
        all_encoder_layers = model_output[0]

    if needs_split:
        if ret_cls is not None:
            # Windows that start with the same id as the very first window are treated as real ones.
            cls_mask = input_ids[:, 0] == input_ids[0][0]
            cls_hidden = all_encoder_layers[:, 0, :]
            if ret_cls == 'max':
                cls_hidden[~cls_mask] = -1e20
            else:
                cls_hidden[~cls_mask] = 0
            cls_mask = cls_mask.view(-1, batch_size).transpose(0, 1)
            cls_hidden = cls_hidden.reshape(cls_mask.size(1), batch_size, -1).transpose(0, 1)
            if ret_cls == 'max':
                cls_hidden = cls_hidden.max(1)[0]
            elif ret_cls == 'raw':
                return cls_hidden, cls_mask
            else:
                cls_hidden = torch.sum(cls_hidden, dim=1)
                cls_hidden /= torch.sum(cls_mask, dim=1, keepdim=True)
            return cls_hidden
        else:
            recombined_embeddings, select_indices = restore_from_sliding_window(all_encoder_layers, batch_size,
                                                                                max_pieces, full_seq_len, start_tokens,
                                                                                end_tokens)

            initial_dims.append(len(select_indices))
    else:
        recombined_embeddings = all_encoder_layers

    # Recombine the outputs of all layers
    # (layers, batch_size * d1 * ... * dn, sequence_length, embedding_dim)
    # recombined = torch.cat(combined, dim=2)
    # input_mask = (recombined_embeddings != 0).long()

    # At this point, mix is (batch_size * d1 * ... * dn, sequence_length, embedding_dim)

    if offsets is None:
        # Resize to (batch_size, d1, ..., dn, sequence_length, embedding_dim)
        dims = initial_dims if needs_split else input_ids.size()
        layers = util.uncombine_initial_dims(recombined_embeddings, dims)
    else:
        # offsets is (batch_size, d1, ..., dn, orig_sequence_length)
        offsets2d = util.combine_initial_dims_to_1d_or_2d(offsets)
        # now offsets is (batch_size * d1 * ... * dn, orig_sequence_length)
        range_vector = util.get_range_vector(offsets2d.size(0),
                                             device=util.get_device_of(recombined_embeddings)).unsqueeze(1)
        # selected embeddings is also (batch_size * d1 * ... * dn, orig_sequence_length)
        selected_embeddings = recombined_embeddings[:, range_vector, offsets2d]

        layers = util.uncombine_initial_dims(selected_embeddings, offsets.size())

    return layers


def split_to_sliding_window(input_ids, max_pieces):
    """Cut ``input_ids`` into windows of ``max_pieces`` and stack them along the batch dimension.

    Args:
        input_ids: Subword indices; the last dimension is split.
        max_pieces: Length of each window.

    Returns:
        The windows concatenated along dimension 0, the last one right-padded with 0 to ``max_pieces``.
    """
    # Split along the sequence; only the last window can be shorter than `max_pieces`.
    *full_windows, last_window = input_ids.split(max_pieces, dim=-1)
    shortfall = max_pieces - last_window.size(-1)
    last_window = F.pad(last_window, pad=[0, shortfall], value=0)
    # Windows become extra rows of the batch.
    return torch.cat([*full_windows, last_window], dim=0)


def restore_from_sliding_window(all_encoder_layers, batch_size, max_pieces, full_seq_len, start_tokens, end_tokens):
    """Stitch the outputs of windows back into one sequence per batch item.

    Args:
        all_encoder_layers: Transformer outputs with the windows stacked along dimension -3.
        batch_size: Number of rows belonging to each window.
        max_pieces: Length of each window.
        full_seq_len: Length of the sequence before it was split.
        start_tokens: Number of special tokens at the start of each window.
        end_tokens: Number of special tokens at the end of each window.

    Returns:
        A tuple of the recombined embeddings and the indices that were selected along dimension -2.
    """
    # Lay the windows side by side again so that every row is one long sequence.
    windows = torch.split(all_encoder_layers, batch_size, dim=-3)
    unpacked = torch.cat(windows, dim=-2)

    # To capture maximal context, keep the middle part of each window plus the leftover start of
    # the first window and the leftover end of the last one. E.g., with max_pieces = 8 and one
    # special token on each side, positions 2, 3, 4 of every window are its middle part.
    # The stride is half of a window once the special start and end tokens are ignored.
    stride = (max_pieces - start_tokens - end_tokens) // 2
    middle_start = stride // 2 + start_tokens
    middle_end = middle_start + stride

    head = list(range(middle_start))
    middle = [position for position in range(full_seq_len)
              if middle_start <= position % max_pieces < middle_end]
    tail = list(range(middle[-1] + 1, full_seq_len))

    select_indices = torch.tensor(head + middle + tail, dtype=torch.long, device=unpacked.device)
    return unpacked.index_select(-2, select_indices), select_indices


def build_optimizer_for_pretrained(model: torch.nn.Module,
                                   pretrained: torch.nn.Module,
                                   lr=1e-5,
                                   weight_decay=0.01,
                                   eps=1e-8,
                                   transformer_lr=None,
                                   transformer_weight_decay=None,
                                   no_decay=('bias', 'LayerNorm.bias', 'LayerNorm.weight'),
                                   **kwargs):
    """Build an AdamW optimizer with separate settings for pretrained and other parameters.

    Args:
        model: The whole model whose named parameters are optimized.
        pretrained: The pretrained sub-module; its parameters use the transformer settings.
        lr: Learning rate of the non-pretrained parameters.
        weight_decay: Weight decay of the non-pretrained parameters.
        eps: Passed to AdamW as ``eps``.
        transformer_lr: Learning rate of the pretrained parameters, ``lr`` when ``None``.
        transformer_weight_decay: Weight decay of the pretrained parameters, ``weight_decay`` when ``None``.
        no_decay: A tuple of substrings, or a callable taking a parameter name; parameters whose
            name matches get a weight decay of 0.
        **kwargs: Forwarded to AdamW.

    Returns:
        The optimizer, with four parameter groups: pretrained/decay, pretrained/no decay,
        non-pretrained/decay and non-pretrained/no decay.
    """
    transformer_lr = lr if transformer_lr is None else transformer_lr
    if transformer_weight_decay is None:
        transformer_weight_decay = weight_decay

    if isinstance(no_decay, tuple):
        def no_decay_fn(name):
            return any(nd in name for nd in no_decay)
    else:
        assert callable(no_decay), 'no_decay has to be callable or a tuple of str'
        no_decay_fn = no_decay

    pretrained_params = set(pretrained.parameters())
    buckets = defaultdict(list)
    for name, param in model.named_parameters():
        origin = 'pretrained' if param in pretrained_params else 'non_pretrained'
        decay = 'no_decay' if no_decay_fn(name) else 'decay'
        buckets[origin, decay].append(param)

    # (origin, decay flag, weight decay, learning rate) of every group, in a fixed order.
    group_specs = (
        ('pretrained', 'decay', transformer_weight_decay, transformer_lr),
        ('pretrained', 'no_decay', 0.0, transformer_lr),
        ('non_pretrained', 'decay', weight_decay, lr),
        ('non_pretrained', 'no_decay', 0.0, lr),
    )
    grouped_parameters = [
        {'params': buckets[origin, decay], 'weight_decay': group_weight_decay, 'lr': group_lr}
        for origin, decay, group_weight_decay, group_lr in group_specs
    ]

    from transformers import optimization
    return optimization.AdamW(
        grouped_parameters,
        lr=lr,
        weight_decay=weight_decay,
        eps=eps,
        no_deprecation_warning=True,  # For backwards compatibility
        **kwargs)


def build_optimizer_scheduler_with_transformer(model: torch.nn.Module,
                                               transformer: torch.nn.Module,
                                               lr: float,
                                               transformer_lr: float,
                                               num_training_steps: int,
                                               warmup_steps: Union[float, int],
                                               weight_decay: float,
                                               adam_epsilon: float,
                                               no_decay=('bias', 'LayerNorm.bias', 'LayerNorm.weight')):
    """Build an optimizer for a model containing a transformer, together with a linear warmup schedule.

    Args:
      model: The whole model to optimize.
      transformer: The transformer inside ``model``; its parameters use ``transformer_lr``.
      lr: Learning rate of the parameters outside the transformer.
      transformer_lr: Learning rate of the transformer parameters.
      num_training_steps: Total number of training steps of the schedule.
      warmup_steps: Either the number of warmup steps, or, when it is a float, the ratio of
        ``num_training_steps`` spent on warmup (has to fall in range (0, 1)).
      weight_decay: Weight decay applied to parameters which are not exempt.
      adam_epsilon: The epsilon of the optimizer.
      no_decay: Passed through to :func:`build_optimizer_for_pretrained`.

    Returns:
      A tuple of ``(optimizer, scheduler)``.

    """
    optimizer = build_optimizer_for_pretrained(
        model,
        transformer,
        lr=lr,
        weight_decay=weight_decay,
        eps=adam_epsilon,
        transformer_lr=transformer_lr,
        no_decay=no_decay,
    )
    given_as_ratio = isinstance(warmup_steps, float)
    if given_as_ratio:
        assert 0 < warmup_steps < 1, 'warmup_steps has to fall in range (0, 1) when it is float.'
    # A ratio is scaled by the total number of steps; the product is intentionally left unrounded.
    num_warmup_steps = num_training_steps * warmup_steps if given_as_ratio else warmup_steps
    from transformers import optimization
    scheduler = optimization.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )
    return optimizer, scheduler


def get_optimizers(
        model: torch.nn.Module,
        num_training_steps: int,
        learning_rate=5e-5,
        adam_epsilon=1e-8,
        weight_decay=0.0,
        warmup_steps=0.1,
) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
    """
    Modified from https://github.com/huggingface/transformers/blob/7b75aa9fa55bee577e2c7403301ed31103125a35/src/transformers/trainer.py#L232
    Create an ``AdamW`` optimizer and a linear warmup-then-decay learning rate scheduler for ``model``.

    Parameters whose names contain ``bias`` or ``LayerNorm.weight`` get no weight decay, all other parameters
    use ``weight_decay``. A float ``warmup_steps`` is a ratio in (0, 1) of ``num_training_steps`` and gets
    converted to an integer number of steps.
    """
    if isinstance(warmup_steps, float):
        assert 0 < warmup_steps < 1
        warmup_steps = int(num_training_steps * warmup_steps)
    # Sort every parameter into the decayed or the exempt bucket in one pass, preserving their order
    exempt_markers = ["bias", "LayerNorm.weight"]
    decayed, exempt = [], []
    for name, param in model.named_parameters():
        bucket = exempt if any(marker in name for marker in exempt_markers) else decayed
        bucket.append(param)
    optimizer_grouped_parameters = [
        {"params": decayed, "weight_decay": weight_decay},
        {"params": exempt, "weight_decay": 0.0},
    ]
    from transformers import AdamW, get_linear_schedule_with_warmup
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_training_steps,
    )
    return optimizer, scheduler


def collect_decay_params(model, weight_decay):
    """Split the parameters of ``model`` into a weight-decayed group and a group exempt from weight decay.

    Args:
      model: A module providing ``named_parameters()``.
      weight_decay: The weight decay of the first (decayed) group.

    Returns:
      A list of two parameter group dicts. The first holds parameters whose names contain neither ``bias`` nor
      ``LayerNorm.weight`` with ``weight_decay``; the second holds the remaining ones with a weight decay of 0.0.

    """
    exempt_markers = ("bias", "LayerNorm.weight")
    with_decay = []
    without_decay = []
    for name, param in model.named_parameters():
        if any(marker in name for marker in exempt_markers):
            without_decay.append(param)
        else:
            with_decay.append(param)
    return [
        {"params": with_decay, "weight_decay": weight_decay},
        {"params": without_decay, "weight_decay": 0.0},
    ]


================================================
FILE: hanlp/layers/transformers/utils_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 15:32
import tensorflow as tf
from hanlp.optimizers.adamw import create_optimizer
from hanlp.utils.log_util import logger


def config_is(config, model='bert'):
    """Tell whether the lowercased class name of ``config`` contains ``model``.

    Args:
      config: Any object; only the name of its class is inspected.
      model: The substring to look for. It is compared as is, so it should be lowercase.
        (Default value = 'bert')

    Returns:
      ``True`` if ``model`` occurs in the lowercased class name.

    """
    class_name = type(config).__name__
    return model in class_name.lower()


def convert_examples_to_features(
        words,
        max_seq_length,
        tokenizer,
        labels=None,
        label_map=None,
        cls_token_at_end=False,
        cls_token="[CLS]",
        cls_token_segment_id=1,
        sep_token="[SEP]",
        sep_token_extra=False,
        pad_on_left=False,
        pad_token_id=0,
        pad_token_segment_id=0,
        pad_token_label_id=0,
        sequence_a_segment_id=0,
        mask_padding_with_zero=True,
        unk_token='[UNK]',
        do_padding=True
):
    """Convert the words of one sentence (and optionally their labels) into transformer input features.

    Every word is split into sub-tokens by ``tokenizer``. The label of a word is attached to its first
    sub-token while the remaining sub-tokens, the special tokens and the padding receive ``pad_token_label_id``.
    A word for which the tokenizer returns nothing is replaced with ``unk_token`` (one per character, at least
    one) so that sub-tokens and labels always stay aligned.

        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)

    Args:
      words: The words of the sentence.
      max_seq_length: The max number of sub-tokens including special tokens. Longer inputs are truncated with
        a warning.
      tokenizer: An object offering ``tokenize(word)`` and ``convert_tokens_to_ids(tokens)``.
      labels: One label per word. When omitted or empty, ``label_ids`` becomes a boolean mask which is ``True``
        on the first sub-token of each word and ``False`` elsewhere.  (Default value = None)
      label_map: A mapping from label to id. When omitted or empty, ``True`` is used as the id of every word.
        (Default value = None)
      cls_token_at_end: Append the CLS token instead of prepending it.  (Default value = False)
      cls_token: The CLS token.  (Default value = "[CLS]")
      cls_token_segment_id: The segment id of the CLS token.  (Default value = 1)
      sep_token: The SEP token.  (Default value = "[SEP]")
      sep_token_extra: Add a second SEP token after the first one.  (Default value = False)
      pad_on_left: Pad on the left side instead of the right side.  (Default value = False)
      pad_token_id: The input id used for padding.  (Default value = 0)
      pad_token_segment_id: The segment id used for padding.  (Default value = 0)
      pad_token_label_id: The label id of non-leading sub-tokens, special tokens and padding.
        (Default value = 0)
      sequence_a_segment_id: The segment id of the sentence tokens and the SEP token(s).  (Default value = 0)
      mask_padding_with_zero: Use 1 for real tokens and 0 for padding in the mask, inverted otherwise.
        (Default value = True)
      unk_token: The replacement for words the tokenizer cannot tokenize.  (Default value = '[UNK]')
      do_padding: Pad every output up to ``max_seq_length``.  (Default value = True)

    Returns:
      A tuple of ``(input_ids, input_mask, segment_ids, label_ids)``, four lists of equal length.

    """
    # Snapshot of the arguments, only used to make the assertion message below informative
    args = locals()
    if not labels:
        labels = words
        pad_token_label_id = False

    tokens = []
    label_ids = []
    for word, label in zip(words, labels):
        word_tokens = tokenizer.tokenize(word)
        if not word_tokens:
            # Some weird chars cause the tokenizer to return an empty list. Emit at least one unk token, even for
            # an empty word, otherwise the word contributes a label but no token and the outputs get misaligned.
            word_tokens = [unk_token] * max(len(word), 1)
        tokens.extend(word_tokens)
        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
        label_ids.extend([label_map[label] if label_map else True] + [pad_token_label_id] * (len(word_tokens) - 1))

    # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
    special_tokens_count = 3 if sep_token_extra else 2
    if len(tokens) > max_seq_length - special_tokens_count:
        logger.warning(
            f'Input tokens {words} exceed the max sequence length of {max_seq_length - special_tokens_count}. '
            f'The exceeded part will be truncated and ignored. '
            f'You are recommended to split your long text into several sentences within '
            f'{max_seq_length - special_tokens_count} tokens beforehand.')
        tokens = tokens[: (max_seq_length - special_tokens_count)]
        label_ids = label_ids[: (max_seq_length - special_tokens_count)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  token_type_ids:   0   0   0   0  0     0   0
    #
    # Where "token_type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens += [sep_token]
    label_ids += [pad_token_label_id]
    if sep_token_extra:
        # roberta uses an extra separator b/w pairs of sentences
        tokens += [sep_token]
        label_ids += [pad_token_label_id]
    segment_ids = [sequence_a_segment_id] * len(tokens)

    if cls_token_at_end:
        tokens += [cls_token]
        label_ids += [pad_token_label_id]
        segment_ids += [cls_token_segment_id]
    else:
        tokens = [cls_token] + tokens
        label_ids = [pad_token_label_id] + label_ids
        segment_ids = [cls_token_segment_id] + segment_ids

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    if do_padding:
        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token_id] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
        else:
            input_ids += [pad_token_id] * padding_length
            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
            segment_ids += [pad_token_segment_id] * padding_length
            label_ids += [pad_token_label_id] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length, f'failed for:\n {args}'
    else:
        assert len(set(len(x) for x in [input_ids, input_mask, segment_ids, label_ids])) == 1
    return input_ids, input_mask, segment_ids, label_ids


def build_adamw_optimizer(config, learning_rate, epsilon, clipnorm, train_steps, use_amp, warmup_steps,
                          weight_decay_rate):
    """Create the AdamW optimizer via ``create_optimizer`` and store its serialized form in ``config``.

    Args:
      config: The config object receiving the serialized optimizer as ``config.optimizer``.
      learning_rate: The initial learning rate.
      epsilon: The epsilon of the optimizer.
      clipnorm: The gradient clipping norm.
      train_steps: The total number of training steps.
      use_amp: Whether to wrap the optimizer with a dynamic loss scale optimizer for mixed precision.
      warmup_steps: The number of warmup steps.
      weight_decay_rate: The weight decay rate.

    Returns:
      The optimizer, wrapped in a ``LossScaleOptimizer`` when ``use_amp`` is truthy.

    """
    optimizer = create_optimizer(
        init_lr=learning_rate,
        epsilon=epsilon,
        weight_decay_rate=weight_decay_rate,
        clipnorm=clipnorm,
        num_train_steps=train_steps,
        num_warmup_steps=warmup_steps,
    )
    config.optimizer = tf.keras.utils.serialize_keras_object(optimizer)
    schedule_config = config.optimizer['config']['learning_rate']['config']
    if 'decay_schedule_fn' in schedule_config:
        # Drop the entries of the nested schedule whose keys start with an underscore
        schedule_config['decay_schedule_fn'] = {
            key: value
            for key, value in schedule_config['decay_schedule_fn'].items()
            if not key.startswith('_')
        }
    if not use_amp:
        return optimizer
    # loss scaling is currently required when using mixed precision
    return tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')


def adjust_tokens_for_transformers(sentence):
    """Re-split the contraction ``n't`` so that the ``n`` is attached to the preceding token.

    For instance ``["do", "n't"]`` becomes ``["don", "'t"]``. A leading ``n't`` with no preceding token is
    kept untouched.
    See https://github.com/DoodleJZ/HPSG-Neural-Parser/blob/master/src_joint/Zparser.py#L1204

    Args:
      sentence: An iterable of tokens.

    Returns:
      A new list of adjusted tokens, the input is not modified.

    """
    adjusted = []
    for token in sentence:
        if token == "n't" and adjusted:
            # Move the "n" over to the previous token and keep only the "'t" part
            adjusted[-1] += "n"
            adjusted.append("'t")
        else:
            adjusted.append(token)
    return adjusted


================================================
FILE: hanlp/layers/weight_normalization.py
================================================
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from hanlp.utils.tf_util import hanlp_register


@hanlp_register
class WeightNormalization(tf.keras.layers.Wrapper):
    """This wrapper reparameterizes a layer by decoupling the weight's
    magnitude and direction.

    This speeds up convergence by improving the
    conditioning of the optimization problem.
    Weight Normalization: A Simple Reparameterization to Accelerate
    Training of Deep Neural Networks: https://arxiv.org/abs/1602.07868
    Tim Salimans, Diederik P. Kingma (2016)
    WeightNormalization wrapper works for keras and tf layers.
    ```python
      net = WeightNormalization(
          tf.keras.layers.Conv2D(2, 2, activation='relu'),
          input_shape=(32, 32, 3),
          data_init=True)(x)
      net = WeightNormalization(
          tf.keras.layers.Conv2D(16, 5, activation='relu'),
          data_init=True)(net)
      net = WeightNormalization(
          tf.keras.layers.Dense(120, activation='relu'),
          data_init=True)(net)
      net = WeightNormalization(
          tf.keras.layers.Dense(n_classes),
          data_init=True)(net)
    ```

    Args:
      layer: a layer instance to wrap. It (or its ``cell`` when it is an RNN layer) has to own a ``kernel``.
      data_init: If truthy, the magnitude ``g`` (and the bias if the layer has one) is initialized from the
        statistics of the first inputs, see ``_data_dep_init``. Otherwise ``g`` is initialized with the norm
        of the kernel, see ``_init_norm``.

    Raises:
      ValueError: When building, if the wrapped layer does not contain a ``kernel``.

    """

    def __init__(self, layer, data_init=True, **kwargs):
        """Wrap ``layer``; ``kwargs`` are forwarded to the ``tf.keras.layers.Wrapper`` constructor."""
        super(WeightNormalization, self).__init__(layer, **kwargs)
        self.data_init = data_init
        self._track_trackable(layer, name='layer')
        # Guards the one-off initialization of `g` performed in `call`
        self._init_critical_section = tf.CriticalSection(name='init_mutex')
        self.is_rnn = isinstance(self.layer, tf.keras.layers.RNN)

    def build(self, input_shape):
        """Build the wrapped layer if needed and create the weights of this wrapper.

        Creates the trainable magnitude ``g`` (one entry per unit of the last kernel dimension), aliases the
        kernel of the wrapped layer as the direction ``v`` and creates the non-trainable boolean flag
        ``_initialized``. When ``data_init`` is truthy, a non-trainable clone of the wrapped layer without
        activation is also built for the data dependent initialization.

        Args:
          input_shape: The shape of the inputs.

        Raises:
          ValueError: If the wrapped layer (its cell for RNN layers) has no ``kernel`` attribute.

        """
        input_shape = tf.TensorShape(input_shape)
        self.input_spec = tf.keras.layers.InputSpec(
            shape=[None] + input_shape[1:])

        if not self.layer.built:
            self.layer.build(input_shape)

        # RNN layers keep their kernel on the cell
        kernel_layer = self.layer.cell if self.is_rnn else self.layer

        if not hasattr(kernel_layer, 'kernel'):
            raise ValueError('`WeightNormalization` must wrap a layer that'
                             ' contains a `kernel` for weights')

        # The kernel's filter or unit dimension is -1
        self.layer_depth = int(kernel_layer.kernel.shape[-1])
        # Normalize over every kernel axis except the last one
        self.kernel_norm_axes = list(range(kernel_layer.kernel.shape.rank - 1))

        self.g = self.add_weight(
            name='g',
            shape=(self.layer_depth,),
            initializer='ones',
            dtype=kernel_layer.kernel.dtype,
            trainable=True)
        self.v = kernel_layer.kernel

        # Flag telling whether `g` has been initialized; zero-initialized, i.e. it starts as False
        self._initialized = self.add_weight(
            name='initialized',
            shape=None,
            initializer='zeros',
            dtype=tf.dtypes.bool,
            trainable=False)

        if self.data_init:
            # Used for data initialization in self._data_dep_init.
            with tf.name_scope('data_dep_init'):
                # Clone the wrapped layer through its config, frozen and with the same weights
                layer_config = tf.keras.layers.serialize(self.layer)
                layer_config['config']['trainable'] = False
                self._naked_clone_layer = tf.keras.layers.deserialize(
                    layer_config)
                self._naked_clone_layer.build(input_shape)
                self._naked_clone_layer.set_weights(self.layer.get_weights())
                # The clone has its activation removed
                if self.is_rnn:
                    self._naked_clone_layer.cell.activation = None
                else:
                    self._naked_clone_layer.activation = None

        self.built = True

    def call(self, inputs):
        """Run the wrapped layer with its kernel replaced by the normalized weight.

        On the first call (while ``_initialized`` is False) the magnitude ``g`` is initialized inside the
        critical section, afterwards it is only read.

        Args:
          inputs: The inputs passed to the wrapped layer.

        Returns:
          The outputs of the wrapped layer.

        """

        def _do_nothing():
            return tf.identity(self.g)

        def _update_weights():
            # Ensure we read `self.g` after _update_weights.
            with tf.control_dependencies(self._initialize_weights(inputs)):
                return tf.identity(self.g)

        g = self._init_critical_section.execute(lambda: tf.cond(
            self._initialized, _do_nothing, _update_weights))

        with tf.name_scope('compute_weights'):
            # Replace kernel by normalized weight variable.
            # NOTE(review): the kernel is assigned to `self.layer` even when `is_rnn` is True, whereas `build`
            # reads the kernel from `self.layer.cell` in that case -- confirm RNN layers pick up this kernel.
            self.layer.kernel = tf.nn.l2_normalize(
                self.v, axis=self.kernel_norm_axes) * g

            # Ensure we calculate result after updating kernel.
            update_kernel = tf.identity(self.layer.kernel)
            with tf.control_dependencies([update_kernel]):
                outputs = self.layer(inputs)
                return outputs

    def compute_output_shape(self, input_shape):
        """Return the output shape of the wrapped layer for ``input_shape`` as a ``tf.TensorShape``."""
        return tf.TensorShape(
            self.layer.compute_output_shape(input_shape).as_list())

    def _initialize_weights(self, inputs):
        """Initialize weight g.

        The initial value of g could either from the initial value in v,
        or by the input value if self.data_init is True.

        Args:
          inputs: The inputs used by the data dependent initialization.

        Returns:
          A list of assign ops: those initializing the weights followed by the one setting ``_initialized``
          to True.

        """
        # Refuse to initialize twice
        with tf.control_dependencies([
            tf.debugging.assert_equal(  # pylint: disable=bad-continuation
                self._initialized,
                False,
                message='The layer has been initialized.')
        ]):
            if self.data_init:
                assign_tensors = self._data_dep_init(inputs)
            else:
                assign_tensors = self._init_norm()
            assign_tensors.append(self._initialized.assign(True))
            return assign_tensors

    def _init_norm(self):
        """Set the weight g with the norm of the weight vector."""
        with tf.name_scope('init_norm'):
            # Flatten every axis but the last one, then take the norm of each column
            v_flat = tf.reshape(self.v, [-1, self.layer_depth])
            v_norm = tf.linalg.norm(v_flat, axis=0)
            g_tensor = self.g.assign(tf.reshape(v_norm, (self.layer_depth,)))
            return [g_tensor]

    def _data_dep_init(self, inputs):
        """Data dependent initialization.

        Runs the activation-free clone of the wrapped layer on ``inputs`` and rescales ``g`` by the inverse
        standard deviation of its outputs. If the wrapped layer has a bias, it is set to the negated mean
        times that scale.

        Args:
          inputs: The inputs fed to the clone of the wrapped layer.

        Returns:
          A list holding the assign op of ``g``, followed by the assign op of the bias when there is one.

        """
        with tf.name_scope('data_dep_init'):
            # Generate data dependent init values
            x_init = self._naked_clone_layer(inputs)
            # Moments are taken over every axis except the last one
            data_norm_axes = list(range(x_init.shape.rank - 1))
            m_init, v_init = tf.nn.moments(x_init, data_norm_axes)
            # 1e-10 keeps the square root away from zero
            scale_init = 1. / tf.math.sqrt(v_init + 1e-10)

            # Assign data dependent init values
            g_tensor = self.g.assign(self.g * scale_init)
            # NOTE(review): the bias is looked up on `self.layer`, not on the cell of RNN layers -- confirm
            if hasattr(self.layer, 'bias') and self.layer.bias is not None:
                bias_tensor = self.layer.bias.assign(-m_init * scale_init)
                return [g_tensor, bias_tensor]
            else:
                return [g_tensor]

    def get_config(self):
        """Return the config of the base wrapper extended with ``data_init``."""
        config = {'data_init': self.data_init}
        base_config = super(WeightNormalization, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


================================================
FILE: hanlp/losses/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-20 01:28

================================================
FILE: hanlp/losses/sparse_categorical_crossentropy.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-20 01:29

import tensorflow as tf

from hanlp.utils.tf_util import hanlp_register


@hanlp_register
class SparseCategoricalCrossentropyOverNonzeroWeights(object):
    """Sparse categorical cross entropy over logits, normalized by the sum of the sample weights.

    Without ``sample_weight`` the plain sum of the per-element losses is returned.
    """

    def __init__(self) -> None:
        super().__init__()
        self.__name__ = self.__class__.__name__

    def __call__(self, y_true, y_pred, sample_weight=None, **kwargs):
        element_losses = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
        if sample_weight is None:
            return tf.reduce_sum(element_losses)
        weighted_total = tf.reduce_sum(element_losses * sample_weight)
        # Dividing by the number of elements instead would be equivalent to SUM_OVER_BATCH_SIZE.
        # Dividing by the sum of weights is SUM_BY_NONZERO_WEIGHTS.
        return weighted_total / tf.reduce_sum(sample_weight)


@hanlp_register
class SparseCategoricalCrossentropyOverBatchFirstDim(object):
    """Sparse categorical cross entropy over logits, summed up and divided by the size of the first dimension
    of ``y_true``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.__name__ = self.__class__.__name__

    def __call__(self, y_true, y_pred, sample_weight=None, **kwargs):
        element_losses = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
        if sample_weight is not None:
            element_losses = element_losses * sample_weight
        total = tf.reduce_sum(element_losses)
        # could use sum of sample_weight[:,0] too
        first_dim = tf.cast(tf.shape(y_true)[0], tf.float32)
        return total / first_dim

    def get_config(self):
        # This loss has no hyper-parameters to serialize
        return dict()


@hanlp_register
class MaskedSparseCategoricalCrossentropyOverBatchFirstDim(object):
    """Sparse categorical cross entropy over logits which ignores positions where ``y_true`` equals
    ``mask_value``. The sum is divided by the size of the first dimension of ``y_true``.
    """

    def __init__(self, mask_value=0) -> None:
        super().__init__()
        self.mask_value = mask_value
        self.__name__ = self.__class__.__name__

    def __call__(self, y_true, y_pred, sample_weight=None, **kwargs):
        assert sample_weight is None, 'the mask will be computed via y_true != mask_value, ' \
                                      'it might conflict with sample_weight'
        # Keep only the positions whose gold label differs from the mask value
        is_active = tf.not_equal(y_true, self.mask_value)
        kept_labels = tf.boolean_mask(y_true, is_active)
        kept_logits = tf.boolean_mask(y_pred, is_active)
        element_losses = tf.keras.losses.sparse_categorical_crossentropy(kept_labels, kept_logits,
                                                                         from_logits=True)
        total = tf.reduce_sum(element_losses)
        first_dim = tf.cast(tf.shape(y_true)[0], tf.float32)
        return total / first_dim


================================================
FILE: hanlp/metrics/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-09-14 21:55

================================================
FILE: hanlp/metrics/accuracy.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-12 17:56
from typing import Optional, Iterable

import torch

from hanlp.metrics.metric import Metric


class CategoricalAccuracy(Metric):
    """
    Top-K accuracy over categorical predictions. Labels are integers and each item
    has exactly one correct class. With tie break enabled, the credit of an item is
    shared equally among all the classes tied at the maximum predicted score.
    Copied from AllenNLP and added several methods.
    """

    def __init__(self, top_k: int = 1, tie_break: bool = False) -> None:
        if tie_break and top_k > 1:
            raise ValueError(
                "Tie break in Categorical Accuracy can be done only for maximum (top_k = 1)"
            )
        if top_k <= 0:
            raise ValueError("top_k passed to Categorical Accuracy must be > 0")
        self._top_k = top_k
        self._tie_break = tie_break
        self.correct_count = 0.0
        self.total_count = 0.0

    def __call__(
            self,
            predictions: torch.Tensor,
            gold_labels: torch.Tensor,
            mask: Optional[torch.BoolTensor] = None,
    ):
        """
        Accumulate the counts of correct and total items of one batch.

        # Parameters

        predictions : `torch.Tensor`, required.
            A tensor of predictions of shape (batch_size, ..., num_classes).
        gold_labels : `torch.Tensor`, required.
            A tensor of integer class label of shape (batch_size, ...). It must be the same
            shape as the `predictions` tensor without the `num_classes` dimension.
        mask : `torch.BoolTensor`, optional (default = `None`).
            A masking tensor the same size as `gold_labels`.
        """
        predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask)

        # Validate the inputs before touching any counter.
        num_classes = predictions.size(-1)
        if gold_labels.dim() != predictions.dim() - 1:
            raise ValueError(
                "gold_labels must have dimension == predictions.size() - 1 but "
                "found tensor of shape: {}".format(predictions.size())
            )
        if (gold_labels >= num_classes).any():
            raise ValueError(
                "A gold label passed to Categorical Accuracy contains an id >= {}, "
                "the number of classes.".format(num_classes)
            )

        # One row of scores per item, one gold label per row.
        flat_scores = predictions.view((-1, num_classes))
        flat_gold = gold_labels.view(-1).long()
        if self._tie_break:
            # An item counts as correct if its gold label is among the classes with the max score;
            # its credit is divided by the number of tied classes.
            row_max = flat_scores.max(-1)[0]
            is_max = flat_scores.eq(row_max.unsqueeze(-1))
            # is_max is (rows X num_classes); pick for each row the entry its gold label points to.
            row_ids = torch.arange(flat_gold.numel(), device=flat_gold.device).long()
            correct = is_max[row_ids, flat_gold].float()
            num_tied = is_max.sum(-1)
            correct /= num_tied.float()
            correct.unsqueeze_(-1)
        else:
            # Indexes of the top K predictions (or fewer, if there aren't K of them).
            # top_k == 1 is special-cased since it's common and .max() is much faster than .topk().
            if self._top_k == 1:
                best_ids = flat_scores.max(-1)[1].unsqueeze(-1)
            else:
                k = min(self._top_k, flat_scores.shape[-1])
                best_ids = flat_scores.topk(k, -1)[1]
            correct = best_ids.eq(flat_gold.unsqueeze(-1)).float()

        if mask is None:
            self.total_count += flat_gold.numel()
        else:
            correct *= mask.view(-1, 1)
            self.total_count += mask.sum()
        self.correct_count += correct.sum()

    @property
    def score(self):
        # Guard against division by zero when nothing has been counted yet.
        if self.total_count > 1e-12:
            return float(self.correct_count) / float(self.total_count)
        return 0.0

    def __repr__(self) -> str:
        return f'Accuracy:{self.score:.2%}'

    @staticmethod
    def detach_tensors(*tensors: torch.Tensor) -> Iterable[torch.Tensor]:
        """
        Passing gradient-tracking Tensors to a Metric would cause a huge memory leak, because
        it prevents garbage collection of the computation graph. This method yields the
        detached version of every tensor, anything which is not a tensor is passed through.
        """
        return (item.detach() if isinstance(item, torch.Tensor) else item for item in tensors)

    def reset(self):
        self.correct_count = 0.0
        self.total_count = 0.0


class BooleanAccuracy(Metric):
    """
    Checks batch-equality of two tensors and computes an accuracy metric based on that.
    A prediction of shape (batch_size, dim_1, ..., dim_n) is treated as a set of `batch_size`
    predictions, each of which has to be *entirely* correct across the remaining dims.
    The denominator of the accuracy is therefore `batch_size`, with the caveat that predictions
    which are totally masked are ignored (in which case the denominator is the number of
    predictions that have at least one unmasked element).

    This is similar to [`CategoricalAccuracy`](./categorical_accuracy.md), if you've already done a `.max()`
    on your predictions.  If you have categorical output, though, you should typically just use
    `CategoricalAccuracy`.  The reason you might want to use this instead is if you've done
    some kind of constrained inference and don't have a prediction tensor that matches the API of
    `CategoricalAccuracy`, which assumes a final dimension of size `num_classes`.
    """

    def __init__(self) -> None:
        self._correct_count = 0.0
        self._total_count = 0.0

    def __call__(
            self,
            predictions: torch.Tensor,
            gold_labels: torch.Tensor,
            mask: Optional[torch.BoolTensor] = None,
    ):
        """
        Accumulate the counts of correct and total instances of one batch.

        # Parameters

        predictions : `torch.Tensor`, required.
            A tensor of predictions of shape (batch_size, ...).
        gold_labels : `torch.Tensor`, required.
            A tensor of the same shape as `predictions`.
        mask : `torch.BoolTensor`, optional (default = `None`).
            A tensor of the same shape as `predictions`.
        """
        predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask)

        # Validate the shapes.
        if gold_labels.size() != predictions.size():
            raise ValueError(
                f"gold_labels must have shape == predictions.size() but "
                f"found tensor of shape: {gold_labels.size()}"
            )
        if mask is not None and mask.size() != predictions.size():
            raise ValueError(
                f"mask must have shape == predictions.size() but "
                f"found tensor of shape: {mask.size()}"
            )

        batch_size = predictions.size(0)

        if mask is None:
            # Without a mask every instance is kept.
            keep = torch.ones(batch_size, device=predictions.device).bool()
        else:
            # Multiplying both sides by the mask makes every masked position compare equal below.
            predictions = predictions * mask
            gold_labels = gold_labels * mask

            # Instances which are completely masked are skipped, all the others are kept.
            keep = mask.view(batch_size, -1).max(dim=1)[0]

        flat_predictions = predictions.view(batch_size, -1)
        flat_gold = gold_labels.view(batch_size, -1)

        # Both tensors are now (batch_size, rest_of_dims_combined). The product over the
        # element-wise equality is 1 only if every element of an instance is correct.
        # Masked positions are automatically "correct" due to the multiplication above.
        instance_correct = flat_predictions.eq(flat_gold).prod(dim=1).float()

        # Fully masked instances look "correct" too, so they are excluded explicitly via `keep`.
        self._correct_count += (instance_correct * keep).sum()
        self._total_count += keep.sum()

    def get_metric(self, reset: bool = False):
        """
        # Returns

        The accumulated accuracy.
        """
        accuracy = float(self._correct_count) / float(self._total_count) if self._total_count > 0 else 0.0
        if reset:
            self.reset()
        return accuracy

    def reset(self):
        self._correct_count = 0.0
        self._total_count = 0.0

    @staticmethod
    def detach_tensors(*tensors: torch.Tensor) -> Iterable[torch.Tensor]:
        """
        Passing gradient-tracking Tensors to a Metric would cause a huge memory leak, because
        it prevents garbage collection of the computation graph. This method yields the
        detached version of every tensor, anything which is not a tensor is passed through.
        """
        return (item.detach() if isinstance(item, torch.Tensor) else item for item in tensors)


================================================
FILE: hanlp/metrics/amr/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-24 12:47

================================================
FILE: hanlp/metrics/amr/smatch_eval.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-24 12:47
import os
import warnings
from typing import Union

from hanlp.metrics.f1 import F1_
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.io_util import get_resource, run_cmd, pushd
from hanlp.utils.log_util import flash

# Resource identifiers that ``smatch_eval`` hands to ``get_resource``. Each is a GitHub archive URL
# followed by a ``#<path>`` fragment; presumably the fragment selects the shell script inside the
# extracted archive -- confirm against ``get_resource``.
_SMATCH_SCRIPT = 'https://github.com/ChunchuanLv/amr-evaluation-tool-enhanced/archive/master.zip#evaluation.sh'
# Used in place of ``_SMATCH_SCRIPT`` when ``smatch_eval`` is called with ``use_fast=True``.
_FAST_SMATCH_SCRIPT = 'https://github.com/jcyk/AMR-gs/archive/master.zip#tools/fast_smatch/compute_smatch.sh'


class SmatchScores(MetricDict):
    """A ``MetricDict`` of smatch sub-metrics whose overall score is that of its ``'Smatch'`` entry."""

    @property
    def score(self):
        """Score of the metric stored under the ``'Smatch'`` key."""
        smatch = self['Smatch']
        return smatch.score


def smatch_eval(pred, gold, use_fast=False) -> Union[SmatchScores, F1_]:
    """Run a smatch evaluation shell script on a prediction file and a gold file.

    Args:
      pred: Path of the prediction file. It is resolved with ``os.path.realpath`` before the working
        directory is changed.
      gold: Path of the gold file, resolved the same way.
      use_fast: If true, the fast smatch script is used and its output is parsed by
        ``format_fast_scores``; otherwise the official script is used and its output is parsed by
        ``format_official_scores``. (Default value = False)

    Returns:
      Whatever the selected parser returns for the script output.

    """
    if use_fast:
        resource, parse = _FAST_SMATCH_SCRIPT, format_fast_scores
    else:
        resource, parse = _SMATCH_SCRIPT, format_official_scores
    script = get_resource(resource)
    script_dir = os.path.dirname(script)
    pred_path = os.path.realpath(pred)
    gold_path = os.path.realpath(gold)
    # The script is executed from its own directory.
    with pushd(script_dir):
        flash('Running evaluation script [blink][yellow]...[/yellow][/blink]')
        output = run_cmd(f'bash {script} {pred_path} {gold_path}')
        flash('')
    return parse(output)


def post_process(pred, amr_version):
    """Run the ``stog`` AMR post-processing module on a prediction file.

    Args:
      pred: Path of the prediction file; resolved with ``os.path.realpath`` first.
      amr_version: AMR version string used by ``get_amr_utils`` to pick the utilities archive.

    Returns:
      The resolved prediction path with ``'.post'`` appended.

    """
    pred = os.path.realpath(pred)
    util_dir = get_resource(get_amr_utils(amr_version))
    stog_home = get_resource('https://github.com/jcyk/AMR-gs/archive/master.zip')
    command = (
        f'python3 -u -m stog.data.dataset_readers.amr_parsing.postprocess.postprocess '
        f'--amr_path {pred} --util_dir {util_dir} --v 2'
    )
    # The module is invoked with ``-m`` from inside the downloaded repository.
    with pushd(stog_home):
        run_cmd(command)
    return pred + '.post'


def get_amr_utils(amr_version):
    """Return the URL of the utilities archive for an AMR release.

    Args:
      amr_version: One of ``'1.0'``, ``'2.0'`` or ``'3.0'``.

    Returns:
      The URL string of the matching archive.

    Raises:
      ValueError: If ``amr_version`` is none of the supported versions.

    """
    known_versions = (
        ('1.0', 'https://www.cs.jhu.edu/~s.zhang/data/AMR/amr_1.0_utils.tar.gz'),
        ('2.0', 'https://www.cs.jhu.edu/~s.zhang/data/AMR/amr_2.0_utils.tar.gz'),
        ('3.0', 'https://file.hankcs.com/research/amr2020/amr_3.0_utils.tgz'),
    )
    for version, utils_tar_gz in known_versions:
        if amr_version == version:
            return utils_tar_gz
    raise ValueError(f'Unsupported AMR version {amr_version}')


def format_official_scores(text: str):
    """Parse the output of the official smatch evaluation script into a ``SmatchScores``.

    Each score line is expected to look like ``<name> -> P: <p>, R: <r>, F: <f>``, e.g.::

        Smatch -> P: 0.136, R: 0.107, F: 0.120
        Unlabeled -> P: 0.229, R: 0.180, F: 0.202
        No WSD -> P: 0.137, R: 0.108, F: 0.120
        Non_sense_frames -> P: 0.008, R: 0.008, F: 0.008
        Wikification -> P: 0.000, R: 0.000, F: 0.000
        Named Ent. -> P: 0.222, R: 0.092, F: 0.130
        Negations -> P: 0.000, R: 0.000, F: 0.000
        IgnoreVars -> P: 0.005, R: 0.003, F: 0.003
        Concepts -> P: 0.075, R: 0.036, F: 0.049
        Frames -> P: 0.007, R: 0.007, F: 0.007
        Reentrancies -> P: 0.113, R: 0.060, F: 0.079
        SRL -> P: 0.145, R: 0.104, F: 0.121

    Args:
      text: The raw text printed by the evaluation script.

    Returns:
      A ``SmatchScores`` mapping every metric name to an ``F1_``. A line that has the ``' -> '``
      separator but whose values cannot be parsed is stored with NaN scores; a non-empty line without
      the separator is skipped. Both cases emit a warning.

    """
    scores = SmatchScores()
    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue
        # ``partition`` never raises, unlike unpacking ``split(' -> ')``, so stray output lines
        # (anything that is not a score line) cannot abort the whole evaluation.
        name, separator, vs = line.partition(' -> ')
        if not separator:
            warnings.warn(f'Failed to parse results from smatch: {line}')
            continue
        try:
            p, r, f = [float(x.split(': ')[-1]) for x in vs.split(', ')]
        except ValueError:
            # Covers both a non-numeric value and a wrong number of values.
            warnings.warn(f'Failed to parse results from smatch: {line}')
            p, r, f = float("nan"), float("nan"), float("nan")
        scores[name] = F1_(p, r, f)
    return scores


def format_fast_scores(text: str):
    """Parse the output of the fast smatch script into an ``F1_``.

    The expected output looks like::

        using fast smatch
        Precision: 0.137
        Recall: 0.108
        Document F-score: 0.121

    Args:
      text: The raw text printed by the script. Blank lines and lines without a ``':'`` are ignored;
        every remaining line must be ``<name>: <float>``.

    Returns:
      ``F1_`` built from the three parsed values in the order they appear.

    """
    values = []
    for raw in text.split('\n'):
        entry = raw.strip()
        if entry and ':' in entry:
            _, value = entry.split(': ')
            values.append(float(value))
    assert len(values) == 3
    return F1_(*values)


================================================
FILE: hanlp/metrics/chunking/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 03:49

================================================
FILE: hanlp/metrics/chunking/binary_chunking_f1.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-02 14:27
from collections import defaultdict
from typing import List, Union

import torch

from hanlp.metrics.f1 import F1


class BinaryChunkingF1(F1):
    """F1 over spans decoded from binary tag tensors, where a non-zero tag marks the start of a span."""

    def __call__(self, pred_tags: torch.LongTensor, gold_tags: torch.LongTensor, lens: List[int] = None):
        """Decode spans from both tag tensors and accumulate them.

        Args:
          pred_tags: Predicted tags, indexed as (sentence, position).
          gold_tags: Gold tags with the same layout.
          lens: Sentence lengths. When omitted, every sentence is given length ``gold_tags.size(1)``.
        """
        if lens is None:
            num_sents, width = gold_tags.size(0), gold_tags.size(1)
            lens = [width] * num_sents
        pred_spans = self.decode_spans(pred_tags, lens)
        gold_spans = self.decode_spans(gold_tags, lens)
        self.update(pred_spans, gold_spans)

    def update(self, pred_tags, gold_tags):
        """Feed each (predicted spans, gold spans) pair to the parent metric as two sets."""
        for pred_spans, gold_spans in zip(pred_tags, gold_tags):
            super().__call__(set(pred_spans), set(gold_spans))

    @staticmethod
    def decode_spans(pred_tags: torch.LongTensor, lens: Union[List[int], torch.LongTensor]):
        """Turn start markers into ``(begin, end)`` spans for every sentence.

        Args:
          pred_tags: Tag tensor whose non-zero entries are span starts.
          lens: Length of every sentence, as a list or a tensor.

        Returns:
          One list of spans per sentence. Each start is paired with the next start of the same
          sentence, and the last start with the sentence length. A sentence without any non-zero
          tag yields the single span ``(0, length)``.
        """
        lengths = lens.tolist() if isinstance(lens, torch.Tensor) else lens
        starts_by_sent = defaultdict(list)
        for sent_idx, start in pred_tags.nonzero(as_tuple=False).tolist():
            starts_by_sent[sent_idx].append(start)
        spans = [[(0, length)] for length in lengths]
        for sent_idx, starts in starts_by_sent.items():
            ends = starts[1:]
            ends.append(lengths[sent_idx])
            spans[sent_idx] = list(zip(starts, ends))
        return spans


================================================
FILE: hanlp/metrics/chunking/bmes_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-09-14 21:55

from hanlp.common.vocab_tf import VocabTF
from hanlp.metrics.chunking.chunking_f1_tf import ChunkingF1_TF
from hanlp.metrics.chunking.sequence_labeling import get_entities


class BMES_F1_TF(ChunkingF1_TF):
    """Chunking F1 that extracts entities from tag sequences with ``get_entities`` and counts matches."""

    def __init__(self, tag_vocab: VocabTF, from_logits=True, suffix=False, name='f1', dtype=None, **kwargs):
        super().__init__(tag_vocab, from_logits, name, dtype, **kwargs)
        # Running totals: matching entities, predicted entities, gold entities.
        self.nb_correct = self.nb_pred = self.nb_true = 0
        # Forwarded to ``get_entities``.
        self.suffix = suffix

    def update_tags(self, true_tags, pred_tags):
        """Accumulate counts for every (gold, predicted) sequence pair and return the current F1."""
        for gold_seq, pred_seq in zip(true_tags, pred_tags):
            gold_entities = get_entities(gold_seq, self.suffix)
            pred_entities = get_entities(pred_seq, self.suffix)
            self.update_entities(gold_entities, pred_entities)
        return self.result()

    def update_entities(self, true_entities, pred_entities):
        """Add the sizes of the gold set, the predicted set and their intersection to the totals."""
        gold = set(true_entities)
        predicted = set(pred_entities)
        self.nb_correct += len(gold & predicted)
        self.nb_pred += len(predicted)
        self.nb_true += len(gold)

    def result(self):
        """Return the F1 of the accumulated counts; any undefined ratio is treated as 0."""
        precision = self.nb_correct / self.nb_pred if self.nb_pred > 0 else 0
        recall = self.nb_correct / self.nb_true if self.nb_true > 0 else 0
        denominator = precision + recall
        if denominator > 0:
            return 2 * precision * recall / denominator
        return 0

    def reset_states(self):
        """Zero the three running totals."""
        self.nb_correct = self.nb_pred = self.nb_true = 0


================================================
FILE: hanlp/metrics/chunking/chunking_f1.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-11 22:14
import io
from collections import defaultdict
from typing import List, Set, Tuple, Dict

from hanlp.metrics.chunking.conlleval import calculate_metrics, DetailedF1, metrics
from hanlp.metrics.chunking.sequence_labeling import get_entities
from hanlp.metrics.f1 import F1
from hanlp.metrics.metric import Metric


class ChunkingF1(F1):
    """F1 over the entities that ``get_entities`` extracts from tag sequences."""

    def __call__(self, pred_tags: List[List[str]], gold_tags: List[List[str]]):
        """Accumulate predicted, gold and matching entity counts for a batch of tag sequences.

        Args:
          pred_tags: Predicted tag sequences, one list per sentence.
          gold_tags: Gold tag sequences aligned with ``pred_tags``.
        """
        for pred_seq, gold_seq in zip(pred_tags, gold_tags):
            pred_entities = set(get_entities(pred_seq))
            gold_entities = set(get_entities(gold_seq))
            self.nb_pred += len(pred_entities)
            self.nb_true += len(gold_entities)
            self.nb_correct += len(pred_entities & gold_entities)


class DetailedSpanF1(Metric):
    """Span F1 that tracks labeled, unlabeled and per-label chunk counts.

    Args:
      do_confusion_matrix: When true, the labels of spans whose boundaries occur in both prediction
        and gold are recorded so that :meth:`confusion_matrix` can be computed.
        (Default value = False)
    """

    def __init__(self, do_confusion_matrix=False):
        self.correct_chunk = 0  # number of correctly identified chunks
        self.correct_unlabeled = 0  # number of chunks with correct boundaries, label ignored
        self.total_gold = 0  # number of chunks in corpus
        self.total_pred = 0  # number of identified chunks
        self.token_counter = 0  # token counter (ignores sentence breaks)

        # counts by type
        self.t_correct_chunk = defaultdict(int)
        self.t_total_gold = defaultdict(int)
        self.t_total_pred = defaultdict(int)

        self.do_confusion_matrix = do_confusion_matrix
        if do_confusion_matrix:
            self.pred_labels = []
            self.gold_labels = []

    @property
    def states(self):
        """The three per-label counters: correct, gold total, predicted total."""
        return (self.t_correct_chunk, self.t_total_gold, self.t_total_pred)

    def reset_state(self):
        """Zero every counter, including the unlabeled one, and empty the per-label counters."""
        self.correct_chunk = 0  # number of correctly identified chunks
        # Must be cleared as well, otherwise unlabeled scores keep counts from before the reset.
        self.correct_unlabeled = 0
        self.total_gold = 0  # number of chunks in corpus
        self.total_pred = 0  # number of identified chunks
        self.token_counter = 0  # token counter (ignores sentence breaks)
        for state in self.states:
            state.clear()
        if self.do_confusion_matrix:
            self.pred_labels = []
            self.gold_labels = []

    @property
    def score(self):
        """Labeled overall F-score of the accumulated counts."""
        overall = calculate_metrics(
            self.correct_chunk, self.total_pred, self.total_gold
        )
        return overall.fscore

    def __call__(self, pred: Set[Tuple[int, int, str]], gold: Set[Tuple[int, int, str]], num_tokens=None):
        """Accumulate counts for one sample.

        Args:
          pred: Predicted spans as ``(begin, end, label)`` triples.
          gold: Gold spans in the same format.
          num_tokens: Number of tokens in the sample, added to ``token_counter`` when truthy.
            (Default value = None)
        """
        pred_chunks_unlabeled = set((b, e) for b, e, l in pred)
        gold_chunks_unlabeled = set((b, e) for b, e, l in gold)
        self.correct_unlabeled += len(pred_chunks_unlabeled & gold_chunks_unlabeled)
        self.correct_chunk += len(pred & gold)
        self.total_gold += len(gold)
        self.total_pred += len(pred)
        if num_tokens:
            self.token_counter += num_tokens

        def group_by_tag(collection: Set[Tuple[int, int, str]]):
            # label -> set of (begin, end)
            group = defaultdict(set)
            for b, e, l in collection:
                group[l].add((b, e))
            return group

        pred_tags = group_by_tag(pred)
        gold_tags = group_by_tag(gold)
        for l in pred_tags.keys() | gold_tags.keys():
            self.t_correct_chunk[l] += len(pred_tags[l] & gold_tags[l])
            self.t_total_gold[l] += len(gold_tags[l])
            self.t_total_pred[l] += len(pred_tags[l])

        if self.do_confusion_matrix:
            def group_by_span(collection: Set[Tuple[int, int, str]]):
                # (begin, end) -> label; a later triple with the same boundaries overwrites an earlier one
                group = dict()
                for b, e, l in collection:
                    group[(b, e)] = l
                return group

            pred_spans = group_by_span(pred)
            gold_spans = group_by_span(gold)
            # Only spans found on both sides contribute a (gold label, predicted label) pair.
            for span in pred_spans.keys() & gold_spans.keys():
                self.pred_labels.append(pred_spans[span])
                self.gold_labels.append(gold_spans[span])

    def reset(self):
        """Alias of :meth:`reset_state`."""
        self.reset_state()

    def report(self) -> Tuple[DetailedF1, Dict[str, DetailedF1], str]:
        """Build a textual report of the accumulated counts.

        Returns:
          A tuple of the labeled overall scores, the per-label scores and the report text, which
          lists unlabeled overall, labeled overall and per-label precision, recall and FB1.
        """
        out = io.StringIO()

        c = self
        out.write('processed %d tokens with %d phrases; ' % (c.token_counter, c.total_gold))
        out.write('found: %d phrases; correct: %d.\n' % (c.total_pred, c.correct_chunk))

        overall = calculate_metrics(c.correct_unlabeled, c.total_pred, c.total_gold)
        out.write('%17s: ' % 'unlabeled overall')
        out.write('precision: %6.2f%%; ' % (100. * overall.prec))
        out.write('recall: %6.2f%%; ' % (100. * overall.rec))
        out.write('FB1: %6.2f\n' % (100. * overall.fscore))

        overall, by_type = metrics(self)
        out.write('%17s: ' % 'labeled overall')
        out.write('precision: %6.2f%%; ' % (100. * overall.prec))
        out.write('recall: %6.2f%%; ' % (100. * overall.rec))
        out.write('FB1: %6.2f\n' % (100. * overall.fscore))

        for i, m in sorted(by_type.items()):
            out.write('%17s: ' % i)
            out.write('precision: %6.2f%%; ' % (100. * m.prec))
            out.write('recall: %6.2f%%; ' % (100. * m.rec))
            out.write('FB1: %6.2f  %d\n' % (100. * m.fscore, c.t_total_pred[i]))
        text = out.getvalue()
        out.close()
        return overall, by_type, text

    def __str__(self) -> str:
        return self.report()[-1]

    def confusion_matrix(self):
        """Compute the label confusion matrix over spans found in both prediction and gold.

        The label lists are only collected when the metric was created with
        ``do_confusion_matrix=True``.

        Returns:
          A tuple of the matrix returned by ``sklearn.metrics.confusion_matrix`` and the sorted list
          of distinct labels that index its rows and columns.
        """
        from sklearn.metrics import confusion_matrix
        # ``labels`` indexes the matrix, so every label must appear exactly once.
        labels = sorted(set(self.gold_labels) | set(self.pred_labels))
        return confusion_matrix(self.gold_labels, self.pred_labels, labels=labels), labels


================================================
FILE: hanlp/metrics/chunking/chunking_f1_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 23:09
from abc import ABC, abstractmethod

import tensorflow as tf

from hanlp.common.vocab_tf import VocabTF


class ChunkingF1_TF(tf.keras.metrics.Metric, ABC):
    """Base class for chunking metrics that operate on tag strings decoded from label id tensors.

    Subclasses implement :meth:`update_tags` and :meth:`result`.
    """

    def __init__(self, tag_vocab: VocabTF, from_logits=True, name='f1', dtype=None, **kwargs):
        super().__init__(name, dtype, dynamic=True, **kwargs)
        self.tag_vocab = tag_vocab
        self.from_logits = from_logits

    def update_the_state(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight: tf.Tensor = None, **kwargs):
        """Decode both tensors into tag sequences and pass them to :meth:`update_tags`.

        The mask is ``sample_weight`` when given, otherwise ``y_pred._keras_mask`` when that attribute
        exists. If there is still no mask and the vocabulary has a ``pad_idx``, positions of ``y_true``
        that differ from ``pad_idx`` are kept.
        """
        mask = sample_weight
        if mask is None:
            mask = getattr(y_pred, '_keras_mask', None)
        if self.tag_vocab.pad_idx is not None and mask is None:
            # The model supplied no mask but the vocabulary has a padding index: derive one from gold.
            mask = y_true != self.tag_vocab.pad_idx
        assert mask is not None, 'ChunkingF1 requires masking, check your _keras_mask or compute_mask'
        if self.from_logits:
            y_pred = tf.argmax(y_pred, axis=-1)
        gold_tags = self.to_tags(y_true, mask)
        pred_tags = self.to_tags(y_pred, mask)
        return self.update_tags(gold_tags, pred_tags)

    def __call__(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight: tf.Tensor = None, **kwargs):
        return self.update_the_state(y_true, y_pred, sample_weight)

    def update_state(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight: tf.Tensor = None, **kwargs):
        return self.update_the_state(y_true, y_pred, sample_weight)

    def to_tags(self, y: tf.Tensor, sample_weight: tf.Tensor):
        """Convert label ids to tag strings, dropping positions whose mask entry is falsy.

        Returns:
          One list of tags per row of ``y``.
        """
        label_ids = y.numpy()
        keep = sample_weight.numpy()
        batch = []
        for sent, mask in zip(label_ids, keep):
            tags = []
            for tag, m in zip(sent, mask):
                if m:
                    idx = int(tag)
                    if self.tag_vocab.pad_idx is not None and idx == self.tag_vocab.pad_idx:
                        # If model predicts <pad>, it will fail most metrics. So replace it with a valid one
                        idx = 1
                    tags.append(self.tag_vocab.get_token(idx))
            batch.append(tags)
        return batch

    @abstractmethod
    def update_tags(self, true_tags, pred_tags):
        """Accumulate statistics from gold and predicted tag sequences."""

    @abstractmethod
    def result(self):
        """Return the current value of the metric."""


================================================
FILE: hanlp/metrics/chunking/conlleval.py
================================================
#!/usr/bin/env python

# Python version of the evaluation script from CoNLL'00-

# Intentional differences:
# - accept any space as delimiter by default
# - optional file argument (default STDIN)
# - option to set boundary (-b argument)
# - LaTeX output (-l argument) not supported
# - raw tags (-r argument) not supported
import io
import sys

from collections import defaultdict, namedtuple
from typing import Tuple, Union, List

from hanlp.utils.span_util import bio_tags_to_spans

from hanlp.metrics.metric import Metric

# Sentinel used as the default of the ``--delimiter`` option in ``parse_args``; per the header
# comment above, the default is to accept any space as delimiter.
ANY_SPACE = '<SPACE>'


class FormatError(Exception):
    """Exception type reserved for malformed evaluation input."""


# Record of true positives, false positives, false negatives, precision, recall and F-score, as
# built by ``calculate_metrics``. Note that the namedtuple's own type name is 'Metrics'.
DetailedF1 = namedtuple('Metrics', 'tp fp fn prec rec fscore')


class EvalCounts(object):
    """Container for the overall and per-type counters used by the chunk evaluation."""

    def __init__(self):
        self._zero_totals()

        # counts by type
        self.t_correct_chunk = defaultdict(int)
        self.t_total_gold = defaultdict(int)
        self.t_total_pred = defaultdict(int)

    def _zero_totals(self):
        """Set every overall counter to zero."""
        self.correct_chunk = 0  # number of correctly identified chunks
        self.correct_tags = 0  # number of correct chunk tags
        self.total_gold = 0  # number of chunks in corpus
        self.total_pred = 0  # number of identified chunks
        self.token_counter = 0  # token counter (ignores sentence breaks)

    @property
    def states(self):
        """The three per-type counters: correct chunks, gold chunks, predicted chunks."""
        return self.t_correct_chunk, self.t_total_gold, self.t_total_pred

    def reset_state(self):
        """Zero the overall counters and empty the per-type counters in place."""
        self._zero_totals()
        for per_type in self.states:
            per_type.clear()


class SpanF1(Metric):
    """Chunk-level F1 accumulated in an ``EvalCounts``.

    Args:
      label_encoding: ``'IOBES'`` selects ``evaluate_iobes``; ``'IOB2'`` or ``'BIO'`` selects
        ``evaluate_iob2``. (Default value = 'IOBES')
    """

    def __init__(self, label_encoding='IOBES') -> None:
        super().__init__()
        self.label_encoding = label_encoding
        self.count = EvalCounts()

    def reset(self):
        """Replace the accumulated counts with a fresh ``EvalCounts``."""
        self.count = EvalCounts()

    @property
    def score(self):
        """Overall F-score of the accumulated counts."""
        overall = self.result(False, False)
        return overall.fscore

    def reset_state(self):
        """Reset the accumulated counts in place."""
        self.count.reset_state()

    def update_state(self, true_seqs: List[str], pred_seqs: List[str]):
        """Evaluate one gold/predicted tag sequence pair and add its counts to the totals.

        Raises:
          ValueError: If ``label_encoding`` is not one of the recognized encodings.
        """
        encoding = self.label_encoding
        if encoding == 'IOBES':
            delta = evaluate_iobes(true_seqs, pred_seqs)
        elif encoding in ['IOB2', 'BIO']:
            delta = evaluate_iob2(true_seqs, pred_seqs)
        else:
            raise ValueError(f'Unrecognized label encoding {self.label_encoding}')
        total = self.count
        total.correct_chunk += delta.correct_chunk
        total.correct_tags += delta.correct_tags
        total.total_gold += delta.total_gold
        total.total_pred += delta.total_pred
        total.token_counter += delta.token_counter
        # Merge the per-type counters key by key.
        for mine, theirs in zip(total.states, delta.states):
            for label, n in theirs.items():
                mine[label] += n

    def batch_update_state(self, true_seqs: List[List[str]], pred_seqs: List[List[str]]):
        """Call :meth:`update_state` on every aligned pair of sequences."""
        for gold_seq, pred_seq in zip(true_seqs, pred_seqs):
            self.update_state(gold_seq, pred_seq)

    def result(self, full=True, verbose=True) -> Union[Tuple[DetailedF1, dict, str], DetailedF1]:
        """Compute scores from the accumulated counts.

        Args:
          full: If true, return ``(overall, by_type, text)`` where ``text`` is the report written by
            ``report``; otherwise return only the overall scores. (Default value = True)
          verbose: If true and ``full`` is true, also print the report text. (Default value = True)
        """
        if not full:
            overall, _ = metrics(self.count)
            return overall
        buffer = io.StringIO()
        overall, by_type = report(self.count, buffer)
        text = buffer.getvalue()
        if verbose:
            print(text)
        buffer.close()
        return overall, by_type, text

    # torch convention: put pred before gold
    def __call__(self, pred_seqs: List[List[str]], true_seqs: List[List[str]]):
        return self.batch_update_state(true_seqs, pred_seqs)

    def __repr__(self) -> str:
        result = self.result(False, False)
        return f"P: {result.prec:.2%} R: {result.rec:.2%} F: {result.fscore:.2%}"


def parse_args(argv):
    """Parse the command line options of the evaluation script.

    Args:
      argv: The argument strings, without the program name.

    Returns:
      The ``argparse`` namespace with ``boundary``, ``delimiter``, ``otag`` and ``file``.

    """
    import argparse
    parser = argparse.ArgumentParser(
        description='evaluate tagging results using CoNLL criteria',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('-b', '--boundary', metavar='STR', default='-X-',
                        help='sentence boundary')
    parser.add_argument('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE,
                        help='character delimiting items in input')
    parser.add_argument('-o', '--otag', metavar='CHAR', default='O',
                        help='alternative outside tag')
    parser.add_argument('file', nargs='?', default=None)
    return parser.parse_args(argv)


def split_tag(chunk_tag):
    """Split a chunk tag into its IOBES prefix and its chunk type.

    Examples: ``B-PER -> [B, PER]`` and ``O -> (O, None)``.

    Args:
      chunk_tag: The tag string. Anything other than ``'O'`` is split at its first ``'-'``.

    Returns:
      The tuple ``('O', None)`` for ``'O'``, otherwise the list produced by the split.

    """
    return ('O', None) if chunk_tag == 'O' else chunk_tag.split('-', 1)


def evaluate_iobes(true_seqs, pred_seqs):
    """Count chunk and tag matches between a gold and a predicted tag sequence.

    The two sequences are walked in parallel (``zip`` stops at the shorter one). Every tag is split
    by ``split_tag`` and chunk boundaries are detected with ``start_of_chunk`` / ``end_of_chunk``.

    Args:
      true_seqs: Iterable of gold tags.
      pred_seqs: Iterable of predicted tags.

    Returns:
      An ``EvalCounts`` with overall and per-type chunk counts, the number of positions where prefix
      and type both match (``correct_tags``) and the number of positions visited
      (``token_counter``).

    """
    counts = EvalCounts()
    in_correct = False  # currently processed chunks is correct until now
    last_correct = 'O'  # previous chunk tag in corpus
    last_correct_type = ''  # type of previous chunk tag in corpus
    last_guessed = 'O'  # previously identified chunk tag
    last_guessed_type = ''  # type of previously identified chunk tag

    for true_tag, pred_tag in zip(true_seqs, pred_seqs):

        guessed, guessed_type = split_tag(pred_tag)
        correct, correct_type = split_tag(true_tag)

        # Boundary events between the previous and the current position, for gold and prediction.
        end_correct = end_of_chunk(last_correct, correct,
                                   last_correct_type, correct_type)
        end_guessed = end_of_chunk(last_guessed, guessed,
                                   last_guessed_type, guessed_type)
        start_correct = start_of_chunk(last_correct, correct,
                                       last_correct_type, correct_type)
        start_guessed = start_of_chunk(last_guessed, guessed,
                                       last_guessed_type, guessed_type)

        if in_correct:
            # An open, so far matching chunk is confirmed when both sides end together with the same
            # type, and abandoned as soon as the two sides diverge.
            if (end_correct and end_guessed and
                    last_guessed_type == last_correct_type):
                in_correct = False
                counts.correct_chunk += 1
                counts.t_correct_chunk[last_correct_type] += 1
            elif (end_correct != end_guessed or guessed_type != correct_type):
                in_correct = False

        # A chunk can only match if both sides start it here with the same type.
        if start_correct and start_guessed and guessed_type == correct_type:
            in_correct = True

        if start_correct:
            counts.total_gold += 1
            counts.t_total_gold[correct_type] += 1
        if start_guessed:
            counts.total_pred += 1
            counts.t_total_pred[guessed_type] += 1
        if correct == guessed and guessed_type == correct_type:
            counts.correct_tags += 1
        counts.token_counter += 1

        last_guessed = guessed
        last_correct = correct
        last_guessed_type = guessed_type
        last_correct_type = correct_type

    # A matching chunk that is still open at the end of the sequence counts as correct.
    if in_correct:
        counts.correct_chunk += 1
        counts.t_correct_chunk[last_correct_type] += 1

    return counts


def evaluate_iob2(true_seqs, pred_seqs):
    """Count span matches between a gold and a predicted BIO tag sequence.

    Spans are extracted with ``bio_tags_to_spans`` and compared as sets.

    Args:
      true_seqs: Gold tags.
      pred_seqs: Predicted tags.

    Returns:
      An ``EvalCounts`` in which only ``correct_chunk``, ``total_pred`` and ``total_gold`` are set;
      every other counter keeps its initial value.

    """
    gold_spans = set(bio_tags_to_spans(true_seqs))
    pred_spans = set(bio_tags_to_spans(pred_seqs))
    result = EvalCounts()
    result.total_gold = len(gold_spans)
    result.total_pred = len(pred_spans)
    result.correct_chunk = len(gold_spans & pred_spans)
    return result


def uniq(iterable):
    """Return the items of ``iterable`` as a list without duplicates, keeping first occurrences in order."""
    seen = set()
    ordered = []
    for item in iterable:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered


def calculate_metrics(correct, guessed, total):
    """Derive precision, recall and F-score from chunk counts.

    Args:
      correct: Number of correctly predicted chunks (true positives).
      guessed: Number of predicted chunks.
      total: Number of gold chunks.

    Returns:
      A ``DetailedF1`` of ``(tp, fp, fn, prec, rec, fscore)``; a ratio with a zero denominator is 0.

    """
    tp = correct
    fp = guessed - correct
    fn = total - correct
    predicted = tp + fp
    actual = tp + fn
    p = 1. * tp / predicted if predicted != 0 else 0.
    r = 1. * tp / actual if actual != 0 else 0.
    f = 2 * p * r / (p + r) if p + r != 0 else 0.
    return DetailedF1(tp, fp, fn, p, r, f)


def calc_metrics(tp, p, t, percent=True):
    """Compute overall precision, recall and FB1; each defaults to 0 when it is undefined.

    Args:
      tp: Number of true positives.
      p: Number of predicted items (denominator of precision).
      t: Number of gold items (denominator of recall).
      percent: If true, every value is multiplied by 100. (Default value = True)

    Returns:
      The tuple ``(precision, recall, fb1)``.

    """
    precision = recall = fb1 = 0
    if p:
        precision = tp / p
    if t:
        recall = tp / t
    if precision + recall:
        fb1 = 2 * precision * recall / (precision + recall)
    if not percent:
        return precision, recall, fb1
    return 100 * precision, 100 * recall, 100 * fb1


def metrics(counts):
    """Compute overall and per-type scores from a counts object.

    Args:
      counts: Object exposing ``correct_chunk``, ``total_pred``, ``total_gold`` and the per-type
        mappings ``t_correct_chunk``, ``t_total_pred`` and ``t_total_gold``.

    Returns:
      A tuple of the overall scores and a dict mapping each type seen in gold or prediction to its
      scores.

    """
    overall = calculate_metrics(counts.correct_chunk, counts.total_pred, counts.total_gold)
    # Gold types first, then predicted types not seen in gold; collected before any lookup below.
    types = uniq(list(counts.t_total_gold.keys()) + list(counts.t_total_pred.keys()))
    by_type = {
        t: calculate_metrics(counts.t_correct_chunk[t], counts.t_total_pred[t], counts.t_total_gold[t])
        for t in types
    }
    return overall, by_type


def report(counts, out=None):
    """Write a CoNLL-style evaluation report.

    Args:
      counts: The counts object to report on; it is passed to ``metrics``.
      out: Writable text stream; ``sys.stdout`` is used when omitted. (Default value = None)

    Returns:
      The ``(overall, by_type)`` pair computed by ``metrics``. The accuracy and overall score line is
      only written when ``counts.token_counter`` is positive.

    """
    stream = sys.stdout if out is None else out
    overall, by_type = metrics(counts)
    emit = stream.write

    emit('processed %d tokens with %d phrases; ' % (counts.token_counter, counts.total_gold))
    emit('found: %d phrases; correct: %d.\n' % (counts.total_pred, counts.correct_chunk))

    if counts.token_counter > 0:
        accuracy = 100. * counts.correct_tags / counts.token_counter
        emit('accuracy: %6.2f%%; ' % accuracy)
        emit('precision: %6.2f%%; ' % (100. * overall.prec))
        emit('recall: %6.2f%%; ' % (100. * overall.rec))
        emit('FB1: %6.2f\n' % (100. * overall.fscore))

    for label, scores in sorted(by_type.items()):
        emit('%17s: ' % label)
        emit('precision: %6.2f%%; ' % (100. * scores.prec))
        emit('recall: %6.2f%%; ' % (100. * scores.rec))
        emit('FB1: %6.2f  %d\n' % (100. * scores.fscore, counts.t_total_pred[label]))
    return overall, by_type


def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Check whether a chunk ended between the previous and the current word.

    Args:
      prev_tag: Previous chunk tag.
      tag: Current chunk tag.
      prev_type: Previous chunk type.
      type_: Current chunk type.

    Returns:
      ``True`` if a chunk ended, ``False`` otherwise.

    """
    ending_transitions = (
        ('B', 'B'), ('B', 'O'), ('I', 'B'), ('I', 'O'),
        ('E', 'E'), ('E', 'I'), ('E', 'O'),
    )
    if (prev_tag, tag) in ending_transitions:
        return True
    # Bracket tags always close a chunk.
    if prev_tag == ']' or prev_tag == '[':
        return True
    # Otherwise a chunk ends only when the type changes after a tag that was inside a chunk.
    return prev_tag != 'O' and prev_tag != '.' and prev_type != type_


def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Check whether a chunk started between the previous and the current word.

    Args:
      prev_tag: Previous chunk tag.
      tag: Current chunk tag.
      prev_type: Previous chunk type.
      type_: Current chunk type.

    Returns:
      ``True`` if a chunk started, ``False`` otherwise.

    """
    starting_transitions = (
        ('B', 'B'), ('I', 'B'), ('O', 'B'), ('O', 'I'),
        ('E', 'E'), ('E', 'I'), ('O', 'E'),
    )
    if (prev_tag, tag) in starting_transitions:
        return True
    # Bracket tags always open a chunk; these chunks are assumed to have length 1.
    if tag == ']' or tag == '[':
        return True
    # Otherwise a chunk starts only when the type changes on a tag that is inside a chunk.
    return tag != 'O' and tag != '.' and prev_type != type_


def _read_tag_columns(lines, args):
    """Collect the gold and predicted tag columns from CoNLL-style lines.

    The last two columns of every line are the gold tag and the predicted tag. Blank lines and lines
    whose first column equals ``args.boundary`` are sentence breaks; they contribute an ``'O'``/``'O'``
    pair so that no chunk spans two sentences. Tags equal to ``args.otag`` are mapped to ``'O'``.

    Returns:
      ``(true_seqs, pred_seqs, num_breaks)`` where ``num_breaks`` is the number of sentence breaks
      that were inserted as ``'O'``/``'O'`` pairs.

    Raises:
      FormatError: If a non-break line has fewer than three columns.
    """
    true_seqs, pred_seqs, num_breaks = [], [], 0
    for line in lines:
        line = line.rstrip('\r\n')
        columns = line.split() if args.delimiter == ANY_SPACE else line.split(args.delimiter)
        if not line.strip() or columns[0] == args.boundary:
            true_seqs.append('O')
            pred_seqs.append('O')
            num_breaks += 1
            continue
        if len(columns) < 3:
            raise FormatError('unexpected number of features in line %s' % line)
        gold_tag, pred_tag = columns[-2], columns[-1]
        true_seqs.append('O' if gold_tag == args.otag else gold_tag)
        pred_seqs.append('O' if pred_tag == args.otag else pred_tag)
    return true_seqs, pred_seqs, num_breaks


def main(argv):
    """Evaluate a CoNLL-style file (or standard input) and print the report.

    Args:
      argv: The full argument vector, including the program name at index 0.

    """
    args = parse_args(argv[1:])

    # ``evaluate_iobes`` expects two tag sequences, so the input lines are parsed first.
    if args.file is None:
        true_seqs, pred_seqs, num_breaks = _read_tag_columns(sys.stdin, args)
    else:
        with open(args.file, encoding='utf-8') as f:
            true_seqs, pred_seqs, num_breaks = _read_tag_columns(f, args)
    counts = evaluate_iobes(true_seqs, pred_seqs)
    # Sentence breaks were fed in as matching 'O' tags; they are not tokens, so take them out again.
    counts.token_counter -= num_breaks
    counts.correct_tags -= num_breaks
    report(counts)


# Script entry point: run ``main`` on the command line arguments and exit with its return value.
if __name__ == '__main__':
    sys.exit(main(sys.argv))


================================================
FILE: hanlp/metrics/chunking/iobes_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-09-14 21:55

from hanlp.common.vocab_tf import VocabTF
from hanlp.metrics.chunking.conlleval import SpanF1
from hanlp.metrics.chunking.chunking_f1_tf import ChunkingF1_TF


class IOBES_F1_TF(ChunkingF1_TF):
    """Chunking F1 that delegates the counting to a ``SpanF1`` instance."""

    def __init__(self, tag_vocab: VocabTF, from_logits=True, name='f1', dtype=None, **kwargs):
        super().__init__(tag_vocab, from_logits, name, dtype, **kwargs)
        self.state = SpanF1()

    def update_tags(self, true_tags, pred_tags):
        """Update the underlying ``SpanF1`` sentence by sentence and return the current F-score."""
        span_f1 = self.state
        for gold_seq, pred_seq in zip(true_tags, pred_tags):
            span_f1.update_state(gold_seq, pred_seq)
        return self.result()

    def result(self):
        """Overall F-score of the underlying ``SpanF1``."""
        overall = self.state.result(full=False, verbose=False)
        return overall.fscore

    def reset_states(self):
        """Reset the underlying ``SpanF1`` counts."""
        self.state.reset_state()


================================================
FILE: hanlp/metrics/chunking/sequence_labeling.py
================================================
# MIT License
#
# Copyright (c) 2018 chakki
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""Metrics to assess performance on sequence labeling task given prediction
Functions named as ``*_score`` return a scalar value to maximize: the higher
the better
"""

from collections import defaultdict
import numpy as np


def iobes_to_span(words, tags):
    """Yield, as one single list, the entities found in ``tags`` together with their surface text.

    Args:
      words: The tokens of the sentence.
      tags: The tags of the sentence, decoded with ``get_entities``.

    Yields:
      A list of ``(text, tag, start, end)`` tuples. The words of an entity are joined with a space,
      or with nothing when every word has length 1.

    """
    every_word_is_one_char = all([len(w) == 1 for w in words])
    delimiter = '' if every_word_is_one_char else ' '  # single characters might be Chinese
    yield [
        (delimiter.join(words[start:end]), tag, start, end)
        for tag, start, end in get_entities(tags)
    ]


def get_entities(seq, suffix=False):
    """Gets entities from sequence.

    Args:
      seq(list): sequence of labels, or a list of such sequences. Nested sequences are flattened
        with an ``'O'`` appended after each one.
      suffix: If true, the tag is the last character of a label and the type is everything before
        its last two characters; otherwise the tag is the first character and the type starts at
        the third. (Default value = False)

    Returns:
      list: list of (chunk_type, chunk_start, chunk_end), where chunk_end is exclusive.
      Example:

    >>> from hanlp.metrics.chunking.sequence_labeling import get_entities
        >>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
        >>> get_entities(seq)
        [('PER', 0, 2), ('LOC', 3, 4)]
    """
    # for nested list
    if any(isinstance(s, list) for s in seq):
        seq = [item for sublist in seq for item in sublist + ['O']]

    chunks = []
    prev_tag, prev_type = 'O', ''
    begin_offset = 0
    # The trailing 'O' closes a chunk that runs up to the end of the sequence.
    for position, label in enumerate(seq + ['O']):
        if suffix:
            tag, type_ = label[-1], label[:-2]
        else:
            tag, type_ = label[0], label[2:]

        if end_of_chunk(prev_tag, tag, prev_type, type_):
            chunks.append((prev_type, begin_offset, position))
        if start_of_chunk(prev_tag, tag, prev_type, type_):
            begin_offset = position
        prev_tag, prev_type = tag, type_

    return chunks


def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Checks if a chunk ended between the previous and current word.

    Args:
      prev_tag: previous chunk tag.
      tag: current chunk tag.
      prev_type: previous type.
      type_: current type.

    Returns:
      chunk_end: boolean.

    """
    # An end or single tag always closes its chunk.
    if prev_tag in ('E', 'S'):
        return True
    # A begin or inside tag is closed by anything that cannot continue the chunk.
    if prev_tag in ('B', 'I') and tag in ('B', 'S', 'O'):
        return True
    # Otherwise only a type change after a chunk tag closes the chunk.
    return prev_tag != 'O' and prev_tag != '.' and prev_type != type_


def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Checks if a chunk started between the previous and current word.

    Args:
      prev_tag: previous chunk tag.
      tag: current chunk tag.
      prev_type: previous type.
      type_: current type.

    Returns:
      chunk_start: boolean.

    """
    # A begin or single tag always opens a chunk.
    if tag in ('B', 'S'):
        return True
    # An inside or end tag opens a chunk when nothing before it could be continued.
    if prev_tag in ('E', 'S', 'O') and tag in ('E', 'I'):
        return True
    # Otherwise only a type change on a chunk tag opens a chunk.
    return tag != 'O' and tag != '.' and prev_type != type_


def f1_score(y_true, y_pred, average='micro', suffix=False):
    """Compute the entity-level F1 score.

    Entities are extracted from both tag sequences with ``get_entities`` and
    compared as sets. The score is the harmonic mean of precision and recall::

        F1 = 2 * (precision * recall) / (precision + recall)

    Args:
      y_true: 2d array. Ground truth (correct) target values.
      y_pred: 2d array. Estimated targets as returned by a tagger.
      average: Accepted for API compatibility; not used by this implementation. (Default value = 'micro')
      suffix: Forwarded to ``get_entities``. (Default value = False)

    Returns:
      score: float, or ``0`` when precision + recall is zero.

    Examples:
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> f1_score(y_true, y_pred)
        0.5
    """
    gold = set(get_entities(y_true, suffix))
    predicted = set(get_entities(y_pred, suffix))
    hits = len(gold & predicted)

    precision = hits / len(predicted) if predicted else 0
    recall = hits / len(gold) if gold else 0

    denominator = precision + recall
    if denominator > 0:
        return 2 * precision * recall / denominator
    return 0


def accuracy_score(y_true, y_pred):
    """Compute the token-level tagging accuracy.

    If any element of ``y_true`` is a list, both inputs are flattened into a
    single sequence of tags first. The score is the fraction of positions at
    which the predicted tag equals the gold tag.

    Args:
      y_true: 2d array. Ground truth (correct) target values.
      y_pred: 2d array. Estimated targets as returned by a tagger.

    Returns:
      score: float, or ``0`` when there are no gold tags at all (consistent
      with the other scores in this module, which return ``0`` instead of
      dividing by zero).

    Examples:
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> accuracy_score(y_true, y_pred)
        0.8
    """
    if any(isinstance(s, list) for s in y_true):
        y_true = [item for sublist in y_true for item in sublist]
        y_pred = [item for sublist in y_pred for item in sublist]

    nb_true = len(y_true)
    if nb_true == 0:
        # Nothing to score: avoid ZeroDivisionError on empty input.
        return 0

    nb_correct = sum(y_t == y_p for y_t, y_p in zip(y_true, y_pred))
    return nb_correct / nb_true


def precision_score(y_true, y_pred, average='micro', suffix=False):
    """Compute the entity-level precision.

    The precision is the ratio ``tp / (tp + fp)``: the number of predicted
    entities that also occur in the gold annotation, divided by the number of
    predicted entities. Entities are extracted with ``get_entities``.

    The best value is 1 and the worst value is 0.

    Args:
      y_true: 2d array. Ground truth (correct) target values.
      y_pred: 2d array. Estimated targets as returned by a tagger.
      average: Accepted for API compatibility; not used by this implementation. (Default value = 'micro')
      suffix: Forwarded to ``get_entities``. (Default value = False)

    Returns:
      score: float, or ``0`` when nothing was predicted.

    Examples:
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> precision_score(y_true, y_pred)
        0.5
    """
    gold = set(get_entities(y_true, suffix))
    predicted = set(get_entities(y_pred, suffix))

    if not predicted:
        return 0
    return len(gold & predicted) / len(predicted)


def recall_score(y_true, y_pred, average='micro', suffix=False):
    """Compute the entity-level recall.

    The recall is the ratio ``tp / (tp + fn)``: the number of gold entities
    that were also predicted, divided by the number of gold entities.
    Entities are extracted with ``get_entities``.

    The best value is 1 and the worst value is 0.

    Args:
      y_true: 2d array. Ground truth (correct) target values.
      y_pred: 2d array. Estimated targets as returned by a tagger.
      average: Accepted for API compatibility; not used by this implementation. (Default value = 'micro')
      suffix: Forwarded to ``get_entities``. (Default value = False)

    Returns:
      score: float, or ``0`` when there are no gold entities.

    Examples:
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> recall_score(y_true, y_pred)
        0.5
    """
    gold = set(get_entities(y_true, suffix))
    predicted = set(get_entities(y_pred, suffix))

    if not gold:
        return 0
    return len(gold & predicted) / len(gold)


def performance_measure(y_true, y_pred):
    """Compute the token-level confusion counts: TP, FP, FN, TN.

    If any element of ``y_true`` is a list, both inputs are flattened first.
    Each aligned (gold, predicted) tag pair falls into exactly one bucket, so
    the four counts always add up to the number of compared tokens:

    * ``TP``: tags are equal and not ``'O'``.
    * ``TN``: both tags are ``'O'``.
    * ``FN``: tags differ and the prediction is ``'O'``.
    * ``FP``: tags differ and the prediction is not ``'O'``.

    Args:
      y_true: 2d array. Ground truth (correct) target values.
      y_pred: 2d array. Estimated targets as returned by a tagger.

    Returns:
      performance_dict: dict with the integer counts under the keys
      ``'TP'``, ``'FP'``, ``'FN'`` and ``'TN'``.

    Examples:
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'B-ORG'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> performance_measure(y_true, y_pred)
        {'TP': 3, 'FP': 2, 'FN': 1, 'TN': 4}
    """
    if any(isinstance(s, list) for s in y_true):
        y_true = [item for sublist in y_true for item in sublist]
        y_pred = [item for sublist in y_pred for item in sublist]

    performance_dict = {'TP': 0, 'FP': 0, 'FN': 0, 'TN': 0}
    for y_t, y_p in zip(y_true, y_pred):
        if y_t == y_p:
            bucket = 'TN' if y_t == 'O' else 'TP'
        elif y_p == 'O':
            # A missed gold tag is a false negative only; counting it as a
            # false positive as well would double count the same token.
            bucket = 'FN'
        else:
            bucket = 'FP'
        performance_dict[bucket] += 1

    return performance_dict


def classification_report(y_true, y_pred, digits=2, suffix=False):
    """Build a text report showing the main classification metrics.

    One row is emitted per entity type found in ``y_true`` (sorted by type
    name so the report is deterministic), followed by two summary rows.

    Args:
      y_true: 2d array. Ground truth (correct) target values.
      y_pred: 2d array. Estimated targets as returned by a classifier.
      digits: int. Number of digits for formatting output floating point values. (Default value = 2)
      suffix: Forwarded to ``get_entities``. (Default value = False)

    Returns:
      report: string. Text summary of the precision, recall, F1 score for each class.

    Note:
      The row labelled ``macro avg`` is the support-weighted average of the
      per-type scores (``np.average(..., weights=support)``); the label is kept
      unchanged so existing consumers of the report keep working.

    Examples:
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> print(classification_report(y_true, y_pred))
                     precision    recall  f1-score   support
        <BLANKLINE>
               MISC       0.00      0.00      0.00         1
                PER       1.00      1.00      1.00         1
        <BLANKLINE>
          micro avg       0.50      0.50      0.50         2
          macro avg       0.50      0.50      0.50         2
        <BLANKLINE>
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    # Group the (start, end) spans by entity type.
    name_width = 0
    true_by_type = defaultdict(set)
    pred_by_type = defaultdict(set)
    for e in true_entities:
        true_by_type[e[0]].add((e[1], e[2]))
        name_width = max(name_width, len(e[0]))
    for e in pred_entities:
        pred_by_type[e[0]].add((e[1], e[2]))

    last_line_heading = 'macro avg'
    width = max(name_width, len(last_line_heading), digits)

    headers = ["precision", "recall", "f1-score", "support"]
    head_fmt = u'{:>{width}s} ' + u' {:>9}' * len(headers)
    report = head_fmt.format(u'', *headers, width=width)
    report += u'\n\n'

    row_fmt = u'{:>{width}s} ' + u' {:>9.{digits}f}' * 3 + u' {:>9}\n'

    ps, rs, f1s, s = [], [], [], []
    # Sets iterate in hash order, so sort the type names for a stable report.
    for type_name in sorted(true_by_type):
        true_spans = true_by_type[type_name]
        pred_spans = pred_by_type.get(type_name, set())
        nb_correct = len(true_spans & pred_spans)
        nb_pred = len(pred_spans)
        nb_true = len(true_spans)

        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0

        report += row_fmt.format(*[type_name, p, r, f1, nb_true], width=width, digits=digits)

        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        s.append(nb_true)

    report += u'\n'

    total_support = sum(s)
    if total_support > 0:
        avg_p = np.average(ps, weights=s)
        avg_r = np.average(rs, weights=s)
        avg_f1 = np.average(f1s, weights=s)
    else:
        # np.average raises ZeroDivisionError when the weights sum to zero,
        # which happens when y_true contains no entities at all.
        avg_p = avg_r = avg_f1 = 0.0

    # compute averages
    report += row_fmt.format('micro avg',
                             precision_score(y_true, y_pred, suffix=suffix),
                             recall_score(y_true, y_pred, suffix=suffix),
                             f1_score(y_true, y_pred, suffix=suffix),
                             total_support,
                             width=width, digits=digits)
    report += row_fmt.format(last_line_heading,
                             avg_p,
                             avg_r,
                             avg_f1,
                             total_support,
                             width=width, digits=digits)

    return report


================================================
FILE: hanlp/metrics/f1.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-10 14:55
from abc import ABC

from hanlp.metrics.metric import Metric


class F1(Metric, ABC):
    """Precision/recall/F1 accumulated from the overlap of predicted and gold sets.

    Args:
      nb_pred: initial number of predicted items. (Default value = 0)
      nb_true: initial number of gold items. (Default value = 0)
      nb_correct: initial number of items present in both. (Default value = 0)
    """

    def __init__(self, nb_pred=0, nb_true=0, nb_correct=0) -> None:
        super().__init__()
        self.nb_correct = nb_correct
        self.nb_pred = nb_pred
        self.nb_true = nb_true

    def __call__(self, pred: set, gold: set):
        """Add one batch: counts the sizes of ``pred``, ``gold`` and their intersection."""
        overlap = pred & gold
        self.nb_correct += len(overlap)
        self.nb_pred += len(pred)
        self.nb_true += len(gold)

    def reset(self):
        """Set all three counters back to zero."""
        self.nb_correct = self.nb_pred = self.nb_true = 0

    @property
    def prf(self):
        """Tuple ``(precision, recall, f1)``; each term is ``.0`` when its denominator is not positive."""
        precision = self.nb_correct / self.nb_pred if self.nb_pred > 0 else .0
        recall = self.nb_correct / self.nb_true if self.nb_true > 0 else .0
        total = precision + recall
        f1 = 2 * precision * recall / total if total > 0 else .0
        return precision, recall, f1

    @property
    def score(self):
        """The F1 component of :attr:`prf`."""
        _, _, f1 = self.prf
        return f1

    def __repr__(self) -> str:
        return 'P: {:.2%} R: {:.2%} F1: {:.2%}'.format(*self.prf)


class F1_(Metric):
    """Container for precomputed precision, recall and F1 values.

    Unlike an accumulating metric it cannot be updated: calling it raises
    ``NotImplementedError``.

    Args:
      p: precision.
      r: recall.
      f: F1 score, which is also reported as :attr:`score`.
    """

    def __init__(self, p, r, f) -> None:
        super().__init__()
        self.f = f
        self.r = r
        self.p = p

    def __call__(self, pred, gold):
        raise NotImplementedError()

    def __repr__(self) -> str:
        return 'P: {:.2%} R: {:.2%} F1: {:.2%}'.format(self.p, self.r, self.f)

    @property
    def score(self):
        """The stored F1 value."""
        return self.f

    def reset(self):
        """Overwrite the three stored values with ``0``."""
        self.f = 0
        self.r = 0
        self.p = 0


================================================
FILE: hanlp/metrics/metric.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-03 11:35
from abc import ABC, abstractmethod


class Metric(ABC):
    """Abstract base class of evaluation metrics.

    Subclasses provide :attr:`score`, ``__call__`` (to accumulate a batch) and
    :meth:`reset`. All rich comparisons are delegated to ``self.score``, so a
    metric can be compared directly with a number or with another metric.

    Note:
      Because ``__eq__`` is overridden without ``__hash__``, Python makes
      instances unhashable.
    """

    def __lt__(self, other):
        return self.score < other

    def __le__(self, other):
        return self.score <= other

    def __eq__(self, other):
        return self.score == other

    def __ge__(self, other):
        return self.score >= other

    def __gt__(self, other):
        return self.score > other

    def __ne__(self, other):
        return self.score != other

    @property
    @abstractmethod
    def score(self):
        """The single number summarising this metric; used for comparisons."""
        pass

    @abstractmethod
    def __call__(self, pred, gold, mask=None):
        """Update the metric with a batch of predictions and gold values."""
        pass

    def __repr__(self) -> str:
        # The format spec belongs inside the braces; outside it is emitted
        # verbatim (e.g. '0.5:.4f') instead of formatting the score.
        return f'{self.score:.4f}'

    def __float__(self):
        # ``float()`` raises TypeError if ``__float__`` returns a non-float
        # (e.g. an int score of 0), so coerce explicitly.
        return float(self.score)

    @abstractmethod
    def reset(self):
        """Clear all accumulated state."""
        pass


================================================
FILE: hanlp/metrics/mtl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-03 00:16
from hanlp.metrics.metric import Metric


class MetricDict(Metric, dict):
    """A dict of named metrics that itself behaves like a metric.

    Values are expected to be metrics (possibly nested ``MetricDict``
    instances): they are called, reset and converted with ``float()``.
    """
    # Colors cycled through by ``cstr`` when rendering entries.
    _COLORS = ["magenta", "cyan", "green", "yellow"]

    @property
    def score(self):
        """Mean of ``float(value)`` over all values; ``0.0`` for an empty dict."""
        if len(self) == 0:
            # Avoid ZeroDivisionError when no metric has been registered yet.
            return 0.0
        return sum(float(x) for x in self.values()) / len(self)

    def __call__(self, pred, gold):
        """Forward the same ``pred`` and ``gold`` to every contained metric."""
        for metric in self.values():
            metric(pred, gold)

    def reset(self):
        """Reset every contained metric."""
        for metric in self.values():
            metric.reset()

    def __repr__(self) -> str:
        return ' '.join(f'({k} {v})' for k, v in self.items())

    def cstr(self, idx=None, level=0) -> str:
        """Render the dict as a string with color/style markup tags.

        Args:
          idx: one-element list holding the running color index; it is shared
            with nested calls so colors keep cycling across levels.
            (Default value = None, meaning start at 0)
          level: nesting depth; selects the bracket pair ``{}``, ``[]`` or
            ``()`` and is capped at 2. (Default value = 0)

        Returns:
          str: the marked-up text.
        """
        if idx is None:
            idx = [0]
        prefix = ''
        for k, v in self.items():
            color = self._COLORS[idx[0] % len(self._COLORS)]
            idx[0] += 1
            child_is_dict = isinstance(v, MetricDict)
            _level = min(level, 2)
            lb = '{[('
            rb = '}])'
            k = f'[bold][underline]{k}[/underline][/bold]'
            prefix += f'[{color}]{lb[_level]}{k} [/{color}]'
            if child_is_dict:
                prefix += v.cstr(idx, level + 1)
            else:
                prefix += f'[{color}]{v}[/{color}]'
            prefix += f'[{color}]{rb[_level]}[/{color}]'
        return prefix


================================================
FILE: hanlp/metrics/parsing/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-27 00:48

================================================
FILE: hanlp/metrics/parsing/attachmentscore.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from hanlp.metrics.metric import Metric


class AttachmentScore(Metric):
    """Unlabeled and labeled attachment scores (UAS / LAS) for dependency parsing.

    Args:
      eps: small constant added to the token count so the ratios never divide
        by zero. (Default value = 1e-12)
    """

    def __init__(self, eps=1e-12):
        super().__init__()

        self.eps = eps
        self.total = 0.0
        self.correct_arcs = 0.0
        self.correct_rels = 0.0

    # noinspection PyMethodOverriding
    def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
        """Accumulate one batch.

        The arguments are used through ``.eq``, mask indexing and
        ``.sum().item()`` -- presumably torch tensors of equal shape with a
        boolean ``mask``; TODO confirm against callers.
        """
        arc_hits = arc_preds.eq(arc_golds)[mask]
        # A relation only counts when its arc is correct as well.
        rel_hits = rel_preds.eq(rel_golds)[mask] & arc_hits

        self.total += len(arc_hits)
        self.correct_arcs += arc_hits.sum().item()
        self.correct_rels += rel_hits.sum().item()

    def reset(self):
        """Clear the accumulated counts (``eps`` is kept)."""
        self.total = 0.0
        self.correct_arcs = 0.0
        self.correct_rels = 0.0

    @property
    def uas(self):
        """Correct arcs divided by ``total + eps``."""
        return self.correct_arcs / (self.total + self.eps)

    @property
    def las(self):
        """Correct labeled arcs divided by ``total + eps``."""
        return self.correct_rels / (self.total + self.eps)

    @property
    def score(self):
        """The LAS, used as the headline score."""
        return self.las

    def __repr__(self):
        return "UAS: {:.2%} LAS: {:.2%}".format(self.uas, self.las)

    def __lt__(self, other):
        return self.score < other

    def __le__(self, other):
        return self.score <= other

    def __ge__(self, other):
        return self.score >= other

    def __gt__(self, other):
        return self.score > other


================================================
FILE: hanlp/metrics/parsing/conllx_eval.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-08 22:35
import tempfile

from hanlp.utils.io_util import get_resource, get_exitcode_stdout_stderr

# Value returned by ``get_resource`` for the CoNLL-X ``eval.pl`` script; it is
# interpolated into the ``perl`` command line built by ``evaluate``.
# NOTE(review): presumably ``get_resource`` fetches the archive and resolves the
# path after ``#`` inside it, which would happen at import time -- confirm.
CONLLX_EVAL = get_resource(
    'https://github.com/elikip/bist-parser/archive/master.zip' + '#bmstparser/src/utils/eval.pl')


def evaluate(gold_file, pred_file):
    """Evaluate using official CoNLL-X evaluation script (Yuval Krymolowski)

    The first six columns of the prediction are replaced with the gold ones
    (see ``copy_cols``) before the script is run. A gold file whose name ends
    with ``.conllu`` is passed through ``copy_cols`` as well, with comments
    dropped. All intermediate files live in a temporary directory that is
    removed when the evaluation finishes.

    Args:
      gold_file(str): The gold conllx file
      pred_file(str): The pred conllx file

    Returns:
      tuple: ``(uas, las)`` parsed from the last lines of the script output.

    Raises:
      RuntimeError: if ``eval.pl`` exits with a non-zero code.

    """
    import os

    gold_file = get_resource(gold_file)
    # ``NamedTemporaryFile().name`` would hand out the name of a file that is
    # deleted as soon as the object is collected, and the files written to
    # that name afterwards would never be cleaned up.
    with tempfile.TemporaryDirectory() as work_dir:
        fixed_pred_file = os.path.join(work_dir, 'pred.conllx')
        copy_cols(gold_file, pred_file, fixed_pred_file, keep_comments=False)
        if gold_file.endswith('.conllu'):
            fixed_gold_file = os.path.join(work_dir, 'gold.conllx')
            copy_cols(gold_file, gold_file, fixed_gold_file, keep_comments=False)
            gold_file = fixed_gold_file

        exitcode, out, err = get_exitcode_stdout_stderr(
            f'perl {CONLLX_EVAL} -q -b -g {gold_file} -s {fixed_pred_file}')
    if exitcode:
        raise RuntimeError(f'eval.pl exited with error code {exitcode} and error message {err} and output {out}.')
    # The scores are read from the 4th and 3rd lines from the end of the
    # output, as ``<correct> / <total>`` taken from tokens 3 and 5.
    lines = out.split('\n')[-4:]
    las = int(lines[0].split()[3]) / int(lines[0].split()[5])
    uas = int(lines[1].split()[3]) / int(lines[1].split()[5])
    return uas, las


def copy_cols(gold_file, pred_file, copied_pred_file, keep_comments=True):
    """Copy the first 6 columns from gold file to pred file

    The two files are read in lockstep. For every token line the output gets
    the first six tab-separated columns of the gold line followed by the
    remaining columns of the prediction line. Comment lines (``#``) in the
    prediction are skipped. Gold lines that are comments or whose first column
    contains ``-`` are not paired with a prediction line. All files are
    read and written as UTF-8.

    Args:
      gold_file: path of the gold file.
      pred_file: path of the prediction file.
      copied_pred_file: path of the output file (overwritten).
      keep_comments: whether skipped gold lines are written to the output.
        (Default value = True)

    Returns:
      None

    Raises:
      ValueError: if the gold file ends a sentence (blank line) where the
        prediction file still has a non-blank line.

    """
    with open(copied_pred_file, 'w', encoding='utf-8') as to_out, \
            open(pred_file, encoding='utf-8') as pred_in, \
            open(gold_file, encoding='utf-8') as gold_in:
        for idx, (p, g) in enumerate(zip(pred_in, gold_in)):
            while p.startswith('#'):
                p = next(pred_in)
            if not g.strip():
                if p.strip():
                    raise ValueError(
                        f'Prediction file {pred_in.name} does not end a sentence at line {idx + 1}\n{p.strip()}')
                to_out.write('\n')
                continue
            # Skip gold comments and lines whose id column contains '-'.
            while g.startswith('#') or '-' in g.split('\t')[0]:
                if keep_comments or g.startswith('-'):
                    to_out.write(g)
                g = next(gold_in)
            to_out.write('\t'.join(str(x) for x in g.split('\t')[:6] + p.split('\t')[6:]))


================================================
FILE: hanlp/metrics/parsing/labeled_f1.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-27 21:42

from hanlp.metrics.metric import Metric


class LabeledF1(Metric):
    """Unlabeled and labeled F1 (UF / LF) over predicted and gold arcs.

    Four running totals are kept: the number of gold arcs, the number of
    predicted arcs, the number of correct arcs and the number of correct arcs
    whose relation is also correct.
    """

    def __init__(self):
        super().__init__()

        self.sum_gold_arcs_wo_punc = 0.0
        self.sum_pred_arcs_wo_punc = 0.0
        self.correct_arcs_wo_punc = 0.0
        self.correct_rels_wo_punc = 0.0

    def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
        """Accumulate one batch.

        ``mask`` is combined with ``arc_golds`` and ``arc_preds`` via ``&`` --
        presumably boolean arc indicator tensors; TODO confirm against callers.
        """
        gold_arcs = mask & arc_golds
        pred_arcs = mask & arc_preds

        # Positions where an arc exists in both the gold and the prediction.
        shared = gold_arcs & pred_arcs
        arc_hits = (arc_preds == arc_golds)[shared]
        rel_hits = (rel_preds == rel_golds)[shared] & arc_hits

        self.sum_gold_arcs_wo_punc += float(gold_arcs.sum())
        self.sum_pred_arcs_wo_punc += float(pred_arcs.sum())
        self.correct_arcs_wo_punc += float(arc_hits.sum())
        self.correct_rels_wo_punc += float(rel_hits.sum())

    def reset(self):
        """Clear all four running totals."""
        self.sum_gold_arcs_wo_punc = 0.0
        self.sum_pred_arcs_wo_punc = 0.0
        self.correct_arcs_wo_punc = 0.0
        self.correct_rels_wo_punc = 0.0

    def __repr__(self):
        return "UF: {:4.2%} LF: {:4.2%}".format(self.uf, self.lf)

    def __lt__(self, other):
        return self.score < other

    def __le__(self, other):
        return self.score <= other

    def __ge__(self, other):
        return self.score >= other

    def __gt__(self, other):
        return self.score > other

    @property
    def score(self):
        """The labeled F1, used as the headline score."""
        return self.las

    @property
    def uas(self):
        """Alias of :attr:`uf`."""
        return self.uf

    @property
    def las(self):
        """Alias of :attr:`lf`."""
        return self.lf

    @property
    def ur(self):
        """Unlabeled recall; ``.0`` when no gold arc has been seen."""
        gold = self.sum_gold_arcs_wo_punc
        return self.correct_arcs_wo_punc / gold if gold else .0

    @property
    def up(self):
        """Unlabeled precision; ``.0`` when no arc has been predicted."""
        pred = self.sum_pred_arcs_wo_punc
        return self.correct_arcs_wo_punc / pred if pred else .0

    @property
    def lr(self):
        """Labeled recall; ``.0`` when no gold arc has been seen."""
        gold = self.sum_gold_arcs_wo_punc
        return self.correct_rels_wo_punc / gold if gold else .0

    @property
    def lp(self):
        """Labeled precision; ``.0`` when no arc has been predicted."""
        pred = self.sum_pred_arcs_wo_punc
        return self.correct_rels_wo_punc / pred if pred else .0

    @property
    def uf(self):
        """Harmonic mean of :attr:`ur` and :attr:`up`; ``.0`` when both are zero."""
        recall, precision = self.ur, self.up
        total = recall + precision
        return 2 * recall * precision / total if total else .0

    @property
    def lf(self):
        """Harmonic mean of :attr:`lr` and :attr:`lp`; ``.0`` when both are zero."""
        recall, precision = self.lr, self.lp
        total = recall + precision
        return 2 * recall * precision / total if total else .0

    def to_dict(self) -> dict:
        """Return the two F1 values under the keys ``'UF'`` and ``'LF'``."""
        return dict(UF=self.uf, LF=self.lf)


================================================
FILE: hanlp/metrics/parsing/labeled_f1_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-27 21:42
import tensorflow as tf


class LabeledF1TF(object):
    """Unlabeled and labeled F1 (UF / LF) over arcs, counted with TensorFlow ops.

    Four running totals are kept: the number of gold arcs, the number of
    predicted arcs, the number of correct arcs and the number of correct arcs
    whose relation is also correct.
    """

    def __init__(self):
        super().__init__()

        self.sum_gold_arcs_wo_punc = 0.0
        self.sum_pred_arcs_wo_punc = 0.0
        self.correct_arcs_wo_punc = 0.0
        self.correct_rels_wo_punc = 0.0

    def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
        """Accumulate one batch.

        NOTE(review): ``mask`` is expanded with ``unsqueeze``/``expand_as``/
        ``transpose(1, 2)`` while the counts use ``tf.math.count_nonzero`` --
        confirm which tensor type callers actually pass.
        """
        mask = mask.unsqueeze(-1).expand_as(arc_preds)
        mask = mask & mask.transpose(1, 2)

        gold_arcs = mask & arc_golds
        pred_arcs = mask & arc_preds
        arc_hits = (arc_preds == arc_golds)[gold_arcs & pred_arcs]
        rel_hits = (rel_preds == rel_golds)[gold_arcs & pred_arcs] & arc_hits

        self.sum_gold_arcs_wo_punc += float(tf.math.count_nonzero(gold_arcs))
        self.sum_pred_arcs_wo_punc += float(tf.math.count_nonzero(pred_arcs))
        self.correct_arcs_wo_punc += float(tf.math.count_nonzero(arc_hits))
        self.correct_rels_wo_punc += float(tf.math.count_nonzero(rel_hits))

    def reset_states(self):
        """Clear all four running totals."""
        self.sum_gold_arcs_wo_punc = 0.0
        self.sum_pred_arcs_wo_punc = 0.0
        self.correct_arcs_wo_punc = 0.0
        self.correct_rels_wo_punc = 0.0

    def __repr__(self):
        return "UF: {:6.2%} LF: {:6.2%}".format(self.uf, self.lf)

    def __lt__(self, other):
        return self.score < other

    def __le__(self, other):
        return self.score <= other

    def __ge__(self, other):
        return self.score >= other

    def __gt__(self, other):
        return self.score > other

    @property
    def score(self):
        """The labeled F1, used as the headline score."""
        return self.las

    @property
    def uas(self):
        """Alias of :attr:`uf`."""
        return self.uf

    @property
    def las(self):
        """Alias of :attr:`lf`."""
        return self.lf

    @property
    def ur(self):
        """Unlabeled recall; ``0`` when no gold arc has been seen."""
        gold = self.sum_gold_arcs_wo_punc
        return self.correct_arcs_wo_punc / gold if gold else 0

    @property
    def up(self):
        """Unlabeled precision; ``0`` when no arc has been predicted."""
        pred = self.sum_pred_arcs_wo_punc
        return self.correct_arcs_wo_punc / pred if pred else 0

    @property
    def lr(self):
        """Labeled recall; ``0`` when no gold arc has been seen."""
        gold = self.sum_gold_arcs_wo_punc
        return self.correct_rels_wo_punc / gold if gold else 0

    @property
    def lp(self):
        """Labeled precision; ``0`` when no arc has been predicted."""
        pred = self.sum_pred_arcs_wo_punc
        return self.correct_rels_wo_punc / pred if pred else 0

    @property
    def uf(self):
        """Harmonic mean of :attr:`ur` and :attr:`up`; ``0`` when both are zero."""
        recall, precision = self.ur, self.up
        total = recall + precision
        return 2 * recall * precision / total if total else 0

    @property
    def lf(self):
        """Harmonic mean of :attr:`lr` and :attr:`lp`; ``0`` when both are zero."""
        recall, precision = self.lr, self.lp
        total = recall + precision
        return 2 * recall * precision / total if total else 0

    def to_dict(self) -> dict:
        """Return the two F1 values under the keys ``'UF'`` and ``'LF'``."""
        return dict(UF=self.uf, LF=self.lf)


================================================
FILE: hanlp/metrics/parsing/labeled_score.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-27 00:49

import tensorflow as tf


class LabeledScore(object):
    """Unlabeled and labeled attachment scores (UAS / LAS), counted with TensorFlow ops.

    Args:
      eps: small constant added to the token count so the ratios never divide
        by zero. (Default value = 1e-5)
    """

    def __init__(self, eps=1e-5):
        super().__init__()

        self.eps = eps
        self.total = 0.0
        self.correct_arcs = 0.0
        self.correct_rels = 0.0

    def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
        """Accumulate one batch; only the positions selected by ``mask`` are counted."""
        arc_hits = (arc_preds == arc_golds)[mask]
        # A relation only counts when its arc is correct as well.
        rel_hits = (rel_preds == rel_golds)[mask] & arc_hits

        self.total += len(arc_hits)
        self.correct_arcs += int(tf.math.count_nonzero(arc_hits))
        self.correct_rels += int(tf.math.count_nonzero(rel_hits))

    def reset_states(self):
        """Clear the accumulated counts (``eps`` is kept)."""
        self.total = 0.0
        self.correct_arcs = 0.0
        self.correct_rels = 0.0

    @property
    def uas(self):
        """Correct arcs divided by ``total + eps``."""
        return self.correct_arcs / (self.total + self.eps)

    @property
    def las(self):
        """Correct labeled arcs divided by ``total + eps``."""
        return self.correct_rels / (self.total + self.eps)

    @property
    def score(self):
        """The LAS, used as the headline score."""
        return self.las

    def to_dict(self) -> dict:
        """Return both scores under the keys ``'UAS'`` and ``'LAS'``."""
        return dict(UAS=self.uas, LAS=self.las)

    def __repr__(self):
        return "UAS: {:6.2%} LAS: {:6.2%}".format(self.uas, self.las)

    def __lt__(self, other):
        return self.score < other

    def __le__(self, other):
        return self.score <= other

    def __ge__(self, other):
        return self.score >= other

    def __gt__(self, other):
        return self.score > other


================================================
FILE: hanlp/metrics/parsing/semdep_eval.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright 2017 Timothy Dozat
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import codecs
import sys
from collections import namedtuple


# ===============================================================
def sdp_eval(gold_files, sys_files, labeled=False):
    """Modified from https://github.com/tdozat/Parser-v3/blob/2ff4061373e8aac8c962537a6220e1d5b196abf6/scripts/semdep_eval.py
    Dozat claimed "I tested it against the official eval script and it reported identical LF1".

    Gold and system files are read in parallel. Every non-blank, non-comment
    gold line is paired with the next system line that is not a comment, not
    blank and whose first column is not ``0``. The edges of a token are read
    from the 9th tab-separated column as ``head:label`` items joined by ``|``
    (``_`` means no edge). Each gold line starting with ``#`` begins a new
    sequence; a sequence counts as correct when every gold edge of every one
    of its tokens was predicted.

    Args:
      gold_files: a path or a list of paths of gold files.
      sys_files: a path or a list of paths of system (predicted) files, in the
        same order as ``gold_files``.
      labeled: compare ``(head, label)`` pairs when true, heads only otherwise.
        (Default value = False)

    Returns:
      Accuracy: namedtuple ``(precision, recall, F1, seq_acc)``. ``seq_acc``
      is ``0.0`` when no sequence header was seen.

    Raises:
      ValueError: if a system file ends before its gold file does.

    """

    def _edges(field):
        # Parse one edge column into a set of heads or (head, label) pairs.
        if field == '_':
            return set()
        items = field.split('|')
        if labeled:
            return set(tuple(item.split(':', 1)) for item in items)
        return set(item.split(':', 1)[0] for item in items)

    correct = 0
    predicted = 0
    actual = 0
    n_sequences = 0
    n_correct_sequences = 0
    if isinstance(gold_files, str):
        gold_files = [gold_files]
    if isinstance(sys_files, str):
        sys_files = [sys_files]

    for gold_file, sys_file in zip(gold_files, sys_files):
        current_seq_correct = False
        with codecs.open(gold_file, encoding='utf-8') as gf, \
                codecs.open(sys_file, encoding='utf-8') as sf:
            gold_line = gf.readline()
            gold_i = 1
            sys_i = 0
            while gold_line:
                while gold_line.startswith('#'):
                    # A header closes the previous sequence and opens a new one.
                    n_sequences += 1
                    n_correct_sequences += current_seq_correct
                    current_seq_correct = True
                    gold_line = gf.readline()
                    gold_i += 1
                if gold_line.rstrip() != '':
                    sys_line = sf.readline()
                    sys_i += 1
                    # ``readline`` returns '' at EOF, which must end the skip
                    # loop instead of being treated as one more blank line.
                    while sys_line and (sys_line.startswith('#') or sys_line.rstrip() == ''
                                        or sys_line.split('\t')[0] == '0'):
                        sys_line = sf.readline()
                        sys_i += 1
                    if not sys_line:
                        raise ValueError(
                            'System file {} ended at line {} before gold file {} did (gold line {})'.format(
                                sys_file, sys_i, gold_file, gold_i))

                    gold_cols = gold_line.rstrip().split('\t')
                    sys_cols = sys_line.rstrip().split('\t')

                    gold_edges = _edges(gold_cols[8])
                    sys_edges = _edges(sys_cols[8])

                    correct_edges = gold_edges & sys_edges
                    if len(correct_edges) != len(gold_edges):
                        current_seq_correct = False
                    correct += len(correct_edges)
                    predicted += len(sys_edges)
                    actual += len(gold_edges)
                gold_line = gf.readline()
                gold_i += 1
        # The last sequence of a file has no following header to close it.
        n_correct_sequences += current_seq_correct

    Accuracy = namedtuple('Accuracy', ['precision', 'recall', 'F1', 'seq_acc'])
    precision = correct / (predicted + 1e-12)
    recall = correct / (actual + 1e-12)
    F1 = 2 * precision * recall / (precision + recall + 1e-12)
    seq_acc = n_correct_sequences / n_sequences if n_sequences else 0.0
    return Accuracy(precision, recall, F1, seq_acc)


# ===============================================================
def main():
    """Command-line entry point: score system files against gold files.

    Every argument after the script name is treated as a file path. The first
    half of the paths is handed to ``sdp_eval`` as the gold files and the
    second half as the system files, so an even number of paths is required.
    The F1 of the unlabeled run is printed as ``UAS`` and the F1 of the
    labeled run as ``LAS``, both as percentages with one decimal.
    """
    paths = sys.argv[1:]
    half, leftover = divmod(len(paths), 2)
    # Gold and system files are paired by position, so the count must be even.
    assert leftover == 0
    gold_files = paths[:half]
    sys_files = paths[half:]
    unlabeled = sdp_eval(gold_files, sys_files, labeled=False)
    labeled = sdp_eval(gold_files, sys_files, labeled=True)
    print('UAS={:0.1f}'.format(unlabeled.F1 * 100))
    print('LAS={:0.1f}'.format(labeled.F1 * 100))


# Run the evaluation only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()


================================================
FILE: hanlp/metrics/parsing/span.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from collections import Counter

from hanlp.metrics.metric import Metric


class SpanMetric(Metric):
    """Accumulates unlabeled and labeled span scores over batches of trees.

    Every prediction and every gold item is a sized collection of hashable
    ``(i, j, label)`` triples. Unlabeled statistics compare only ``(i, j)``
    while labeled statistics compare the whole triple. Duplicated spans are
    matched as a multiset.

    Args:
        eps: Small constant added to every denominator to avoid division by zero.
    """

    def __init__(self, eps=1e-12):
        super().__init__()
        # Stored before ``reset`` so that a later argument-less ``reset()`` keeps it.
        self.eps = eps
        self.reset()

    # noinspection PyAttributeOutsideInit
    def reset(self, eps=None):
        """Clear all accumulated counts.

        Args:
            eps: Optional new smoothing constant. When omitted, the value that is
                currently configured (e.g. the one given to the constructor) is
                kept instead of being overwritten with a hard-coded default.
        """
        self.n = 0.0
        self.n_ucm = 0.0
        self.n_lcm = 0.0
        self.utp = 0.0
        self.ltp = 0.0
        self.pred = 0.0
        self.gold = 0.0
        if eps is not None:
            self.eps = eps

    def __call__(self, preds, golds):
        """Update the counts with a batch of predictions and references.

        Args:
            preds: Iterable of predicted span collections, one per sample.
            golds: Iterable of gold span collections, aligned with ``preds``.

        Returns:
            This metric itself, so the updated scores can be read right away.
        """
        for pred, gold in zip(preds, golds):
            upred = Counter((i, j) for i, j, label in pred)
            ugold = Counter((i, j) for i, j, label in gold)
            # The multiset intersection keeps min(count_pred, count_gold) per span,
            # so summing its values gives the number of true positives.
            n_utp = sum((upred & ugold).values())
            n_ltp = sum((Counter(pred) & Counter(gold)).values())
            n_pred, n_gold = len(pred), len(gold)
            self.n += 1
            # A sample is a complete match only if every predicted and gold span is matched.
            self.n_ucm += n_utp == n_pred == n_gold
            self.n_lcm += n_ltp == n_pred == n_gold
            self.utp += n_utp
            self.ltp += n_ltp
            self.pred += n_pred
            self.gold += n_gold
        return self

    def __repr__(self):
        s = f"UCM: {self.ucm:.2%} LCM: {self.lcm:.2%} "
        s += f"UP: {self.up:.2%} UR: {self.ur:.2%} UF: {self.uf:.2%} "
        s += f"LP: {self.lp:.2%} LR: {self.lr:.2%} LF: {self.lf:.2%}"

        return s

    @property
    def score(self):
        """The headline score of this metric, which is the labeled F1."""
        return self.lf

    @property
    def ucm(self):
        """Fraction of samples whose unlabeled spans match completely."""
        return self.n_ucm / (self.n + self.eps)

    @property
    def lcm(self):
        """Fraction of samples whose labeled spans match completely."""
        return self.n_lcm / (self.n + self.eps)

    @property
    def up(self):
        """Unlabeled precision."""
        return self.utp / (self.pred + self.eps)

    @property
    def ur(self):
        """Unlabeled recall."""
        return self.utp / (self.gold + self.eps)

    @property
    def uf(self):
        """Unlabeled F1, computed as ``2 * tp / (pred + gold)``."""
        return 2 * self.utp / (self.pred + self.gold + self.eps)

    @property
    def lp(self):
        """Labeled precision."""
        return self.ltp / (self.pred + self.eps)

    @property
    def lr(self):
        """Labeled recall."""
        return self.ltp / (self.gold + self.eps)

    @property
    def lf(self):
        """Labeled F1, computed as ``2 * tp / (pred + gold)``."""
        return 2 * self.ltp / (self.pred + self.gold + self.eps)


================================================
FILE: hanlp/metrics/spearman_correlation.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-23 16:12
import torch

from hanlp.metrics.metric import Metric


def _get_ranks(x: torch.Tensor) -> torch.Tensor:
    argsort = x.argsort()
    ranks = torch.zeros_like(argsort, device=x.device)
    ranks[argsort] = torch.arange(len(x), device=x.device)
    return ranks


def spearman_correlation(x: torch.Tensor, y: torch.Tensor):
    """Compute Spearman's rank correlation between two 1-D vectors.

    Uses the closed form ``1 - 6 * sum(d ** 2) / (n * (n ** 2 - 1))`` where ``d``
    is the difference between the ranks of paired elements. Adopted from
    https://discuss.pytorch.org/t/spearmans-correlation/91931/5

    Args:
        x: Shape (N, )
        y: Shape (N, )

    Returns:
        A scalar tensor holding the correlation coefficient.
    """
    rank_gap = _get_ranks(x) - _get_ranks(y)
    n = x.size(0)
    numerator = 6 * torch.sum(rank_gap.pow(2))
    denominator = n * (n ** 2 - 1.0)
    return 1.0 - (numerator / denominator)


class SpearmanCorrelation(Metric):
    """
    This `Metric` calculates the sample Spearman correlation coefficient (r)
    between two tensors. Each element in the two tensors is assumed to be
    a different observation of the variable (i.e., the input tensors are
    implicitly flattened into vectors and the correlation is calculated
    between the vectors).

    All observations passed to ``__call__`` are kept until ``reset`` is called,
    and the coefficient is computed over the whole history on demand.

    <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>
    """

    @property
    def score(self):
        """Spearman correlation over everything accumulated so far, as a Python float."""
        return spearman_correlation(self.total_predictions, self.total_gold_labels).item()

    def __init__(self) -> None:
        super().__init__()
        self.total_predictions = torch.zeros(0)
        self.total_gold_labels = torch.zeros(0)

    def __call__(
            self,
            predictions: torch.Tensor,
            gold_labels: torch.Tensor,
            mask=None
    ):
        """Append a batch of observations to the accumulated history.

        Args:
            predictions: A tensor of predictions of shape (batch_size, ...).
            gold_labels: A tensor of the same shape as ``predictions``.
            mask: Not supported; must be left as ``None``.

        Raises:
            NotImplementedError: If ``mask`` is given.
        """
        if mask is not None:
            # ``NotImplemented`` is a non-callable constant; the exception type is
            # ``NotImplementedError``.
            raise NotImplementedError('mask not supported in SpearmanCorrelation for now.')
        # Flatten predictions and gold_labels. We calculate the Spearman correlation between
        # the vectors, since each element in the predictions and gold_labels tensor is assumed
        # to be a separate observation.
        predictions = predictions.reshape(-1)
        gold_labels = gold_labels.reshape(-1)

        # Move the history to the device of the incoming batch before concatenating.
        self.total_predictions = self.total_predictions.to(predictions.device)
        self.total_gold_labels = self.total_gold_labels.to(gold_labels.device)
        self.total_predictions = torch.cat((self.total_predictions, predictions), 0)
        self.total_gold_labels = torch.cat((self.total_gold_labels, gold_labels), 0)

    def reset(self):
        """Drop all accumulated observations."""
        self.total_predictions = torch.zeros(0)
        self.total_gold_labels = torch.zeros(0)

    def __str__(self) -> str:
        return f'spearman: {self.score * 100:.2f}'


================================================
FILE: hanlp/metrics/srl/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-16 18:44

================================================
FILE: hanlp/metrics/srl/srlconll.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-16 18:44
import os

from hanlp.utils.io_util import get_resource, get_exitcode_stdout_stderr, run_cmd


def official_conll_05_evaluate(pred_path, gold_path):
    """Score SRL predictions with the ``srl-eval.pl`` script from srlconll-1.1.

    The script package is fetched through ``get_resource`` and its ``lib`` and
    ``bin`` folders are prepended to ``PERL5LIB`` and ``PATH`` when not already
    present. The script is run twice with the two files in swapped order; the
    number read from the ``(gold, pred)`` run is used as recall and the one from
    the ``(pred, gold)`` run as precision.

    Args:
        pred_path: Path to the prediction file.
        gold_path: Path to the gold file.

    Returns:
        A ``(precision, recall, f1)`` tuple with precision and recall scaled to ``[0, 1]``.
    """

    def _read_score(report):
        # Field 5 of line 6 of the stripped report, given as a percentage.
        # NOTE(review): presumably the overall row of the script's table - confirm.
        return float(report.strip().split("\n")[6].strip().split()[5]) / 100

    script_root = get_resource('http://www.lsi.upc.edu/~srlconll/srlconll-1.1.tgz')
    for variable, folder in (('PERL5LIB', 'lib'), ('PATH', 'bin')):
        directory = f'{script_root}/{folder}'
        current = os.environ.get(variable, '')
        if directory not in current:
            os.environ[variable] = f'{directory}:{current}'

    evaluator = f'{script_root}/bin/srl-eval.pl'
    report_gold_pred = run_cmd(f'perl {evaluator} {gold_path} {pred_path}')
    report_pred_gold = run_cmd(f'perl {evaluator} {pred_path} {gold_path}')
    conll_recall = _read_score(report_gold_pred)
    conll_precision = _read_score(report_pred_gold)

    total = conll_recall + conll_precision
    conll_f1 = 2 * conll_recall * conll_precision / total if total > 0 else 0
    return conll_precision, conll_recall, conll_f1


def run_perl(script, src, dst=None):
    """Run a Perl script on ``src`` and save its standard output to ``dst``.

    ``PERL5LIB`` is cleared and ``~/.local/lib/perl5`` is added to the include
    path of the interpreter instead.

    Args:
        script: Path of the Perl script to run.
        src: Argument passed to the script.
        dst: File that receives the script's standard output. It is handed to
            ``open`` as is, so callers need to supply a real path despite the
            ``None`` default.

    Returns:
        ``dst``.

    Raises:
        RuntimeError: If the script exits with a non-zero code; carries its stderr.
    """
    os.environ['PERL5LIB'] = ''
    local_lib = os.path.expanduser("~/.local/lib/perl5")
    status, stdout, stderr = get_exitcode_stdout_stderr(f'perl -I{local_lib} {script} {src}')
    if status:
        # Modules that may have to be installed first:
        # cpanm -l ~/.local namespace::autoclean
        # cpanm -l ~/.local Moose
        # cpanm -l ~/.local MooseX::SemiAffordanceAccessor module
        raise RuntimeError(stderr)
    with open(dst, 'w') as sink:
        sink.write(stdout)
    return dst


================================================
FILE: hanlp/optimizers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-11 18:44

================================================
FILE: hanlp/optimizers/adamw/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-11 18:44
import tensorflow as tf
from hanlp.optimizers.adamw.optimization import WarmUp, AdamWeightDecay


# from hanlp.optimization.adamw.optimizers_v2 import AdamW
# from hanlp.optimization.adamw.utils import get_weight_decays


# def create_optimizer(model, init_lr, num_train_steps, num_warmup_steps):
#     """Creates an optimizer with learning rate schedule."""
#     wd_dict = get_weight_decays(model)
#
#     # Implements linear decay of the learning rate.
#     learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
#         initial_learning_rate=init_lr,
#         decay_steps=num_train_steps,
#         end_learning_rate=0.0)
#     if num_warmup_steps:
#         learning_rate_fn = WarmUp(initial_learning_rate=init_lr,
#                                   decay_schedule_fn=learning_rate_fn,
#                                   warmup_steps=num_warmup_steps)
#     optimizer = AdamW(
#         learning_rate=learning_rate_fn,
#         weight_decay_rate=0.01,
#         beta_1=0.9,
#         beta_2=0.999,
#         epsilon=1e-6,
#         exclude_from_weight_decay=['layer_norm', 'bias'])
#     return optimizer


def create_optimizer(init_lr, num_train_steps, num_warmup_steps, weight_decay_rate=0.01, epsilon=1e-6, clipnorm=None):
    """Creates an ``AdamWeightDecay`` optimizer with a learning rate schedule.

    The learning rate decays polynomially from ``init_lr`` to zero over
    ``num_train_steps`` and is optionally preceded by a warmup phase.

    Args:
      init_lr: Initial (peak) learning rate.
      num_train_steps: Number of steps over which the learning rate decays to zero.
      num_warmup_steps: Number of warmup steps; a falsy value disables warmup.
      weight_decay_rate: Weight decay rate handed to the optimizer. (Default value = 0.01)
      epsilon: Epsilon handed to the optimizer. (Default value = 1e-6)
      clipnorm: If truthy, forwarded to the optimizer as ``clipnorm``. (Default value = None)

    Returns:
      The configured ``AdamWeightDecay`` optimizer.

    """
    # Decay schedule, wrapped in a warmup schedule when warmup is requested.
    schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps,
        end_learning_rate=0.0)
    if num_warmup_steps:
        schedule = WarmUp(initial_learning_rate=init_lr,
                          decay_schedule_fn=schedule,
                          warmup_steps=num_warmup_steps)
    # ``clipnorm`` is only passed on when it was actually requested.
    extra_kwargs = {'clipnorm': clipnorm} if clipnorm else {}
    # {'LayerNorm/gamma:0', 'LayerNorm/beta:0'}
    return AdamWeightDecay(
        learning_rate=schedule,
        weight_decay_rate=weight_decay_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=epsilon,
        exclude_from_weight_decay=['LayerNorm', 'bias'],
        **extra_kwargs
    )


================================================
FILE: hanlp/optimizers/adamw/optimization.py
================================================
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to optimization (weight updates)."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re

import tensorflow as tf


class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Applies a warmup schedule on top of a given learning rate decay schedule."""

    def __init__(
            self,
            initial_learning_rate,
            decay_schedule_fn,
            warmup_steps,
            power=1.0,
            name=None):
        """Store the warmup settings.

        Args:
          initial_learning_rate: Learning rate reached at the end of warmup.
          decay_schedule_fn: Schedule that is called with the step once warmup is over.
          warmup_steps: Number of warmup steps.
          power: Exponent of the polynomial warmup. (Default value = 1.0)
          name: Optional name scope; ``'WarmUp'`` is used when empty. (Default value = None)
        """
        super(WarmUp, self).__init__()
        self.name = name
        self.power = power
        self.warmup_steps = warmup_steps
        self.decay_schedule_fn = decay_schedule_fn
        self.initial_learning_rate = initial_learning_rate

    def __call__(self, step):
        """Return the learning rate for ``step``."""
        with tf.name_scope(self.name or 'WarmUp') as scope:
            # Polynomial warmup: while step < warmup_steps the learning rate is
            # `initial_learning_rate * (step / warmup_steps) ** power`.
            current = tf.cast(step, tf.float32)
            limit = tf.cast(self.warmup_steps, tf.float32)
            progress = current / limit
            warmup_lr = self.initial_learning_rate * tf.math.pow(progress, self.power)
            in_warmup = current < limit
            return tf.cond(in_warmup,
                           lambda: warmup_lr,
                           lambda: self.decay_schedule_fn(step),
                           name=scope)

    def get_config(self):
        """Return the constructor arguments as a dict."""
        return dict(
            initial_learning_rate=self.initial_learning_rate,
            decay_schedule_fn=self.decay_schedule_fn,
            warmup_steps=self.warmup_steps,
            power=self.power,
            name=self.name,
        )


def create_optimizer(init_lr, num_train_steps, num_warmup_steps):
    """Creates an ``AdamWeightDecay`` optimizer with a learning rate schedule.

    Args:
      init_lr: Initial (peak) learning rate.
      num_train_steps: Number of steps over which the learning rate decays to zero.
      num_warmup_steps: Number of warmup steps; a falsy value disables warmup.

    Returns:
      The configured ``AdamWeightDecay`` optimizer.

    """
    # Decay schedule, wrapped in a warmup schedule when warmup is requested.
    schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps,
        end_learning_rate=0.0)
    if num_warmup_steps:
        schedule = WarmUp(initial_learning_rate=init_lr,
                          decay_schedule_fn=schedule,
                          warmup_steps=num_warmup_steps)
    return AdamWeightDecay(
        learning_rate=schedule,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=['layer_norm', 'bias'])


# Pick the Adam implementation that ``AdamWeightDecay`` derives from: the legacy
# optimizer when ``tf.keras.optimizers.legacy`` can be resolved, else the default one.
try:
    AdamTF = tf.keras.optimizers.legacy.Adam  # avoid slowdown when using v2.11+ Keras optimizers on M1/M2 Macs
except:
    # NOTE(review): the bare ``except`` falls back on any error raised by the lookup
    # above, not only on a missing ``legacy`` namespace - confirm this is intended.
    AdamTF = tf.keras.optimizers.Adam


class AdamWeightDecay(AdamTF):
    """Adam with decoupled L2 weight decay.

      Just adding the square of the weights to the loss function is *not* the
      correct way of using L2 regularization/weight decay with Adam, since that will
      interact with the m and v parameters in strange ways.

      Instead we want to decay the weights in a manner that doesn't interact with
      the m/v parameters. This is equivalent to adding the square of the weights to
      the loss with plain (non-momentum) SGD.

    This class does not clip gradients itself: ``apply_gradients`` hands them to
    the base optimizer unchanged. Extra keyword arguments of the constructor are
    forwarded to the base Adam optimizer.

    Args:
      learning_rate: Learning rate or learning rate schedule. (Default value = 0.001)
      beta_1: Forwarded to the base Adam optimizer. (Default value = 0.9)
      beta_2: Forwarded to the base Adam optimizer. (Default value = 0.999)
      epsilon: Forwarded to the base Adam optimizer. (Default value = 1e-7)
      amsgrad: Forwarded to the base Adam optimizer. (Default value = False)
      weight_decay_rate: Decay rate; ``0`` disables weight decay. (Default value = 0.0)
      include_in_weight_decay: Regex patterns of variable names that are always decayed.
      exclude_from_weight_decay: Regex patterns of variable names that are never decayed,
        unless they also match an include pattern.
      name: Name of the optimizer. (Default value = 'AdamWeightDecay')
      **kwargs: Forwarded to the base Adam optimizer.

    """

    def __init__(self,
                 learning_rate=0.001,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-7,
                 amsgrad=False,
                 weight_decay_rate=0.0,
                 include_in_weight_decay=None,
                 exclude_from_weight_decay=None,
                 name='AdamWeightDecay',
                 **kwargs):
        super(AdamWeightDecay, self).__init__(
            learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
        self.weight_decay_rate = weight_decay_rate
        self._include_in_weight_decay = include_in_weight_decay
        self._exclude_from_weight_decay = exclude_from_weight_decay

    @classmethod
    def from_config(cls, config):
        """Creates an optimizer from its config with WarmUp custom object.

        Args:
          config: Optimizer configuration, as produced by ``get_config``.

        Returns:
          The optimizer built from ``config``.

        """
        custom_objects = {'WarmUp': WarmUp}
        return super(AdamWeightDecay, cls).from_config(
            config, custom_objects=custom_objects)

    def _prepare_local(self, var_device, var_dtype, apply_state):
        """Adds the weight decay rate as a constant to ``apply_state``."""
        super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype,
                                                    apply_state)
        apply_state['weight_decay_rate'] = tf.constant(
            self.weight_decay_rate, name='adam_weight_decay_rate')

    def _decay_weights_op(self, var, learning_rate, apply_state):
        """Returns the op that decays ``var``, or a no-op if ``var`` is exempt.

        The update is ``var -= learning_rate * var * weight_decay_rate``.
        """
        do_decay = self._do_use_weight_decay(var.name)
        if do_decay:
            return var.assign_sub(
                learning_rate * var *
                apply_state['weight_decay_rate'],
                use_locking=self._use_locking)
        return tf.no_op()

    def _get_lr(self, var_device, var_dtype, apply_state):
        """Retrieves the learning rate with the given state.

        Args:
          var_device: Device of the variable being updated.
          var_dtype: Base dtype of the variable being updated.
          apply_state: State dict keyed by ``(device, dtype)``, or ``None``.

        Returns:
          A ``(learning_rate, kwargs)`` pair where ``kwargs`` is passed on to the
          base class update method.

        """
        if apply_state is None:
            return self._decayed_lr_t[var_dtype], {}

        apply_state = apply_state or {}
        coefficients = apply_state.get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
            apply_state[(var_device, var_dtype)] = coefficients

        return coefficients['lr_t'], dict(apply_state=apply_state)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        """Decays ``var`` and then applies the dense Adam update of the base class."""
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        # The control dependency makes the decay run before the Adam update.
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_dense(
                grad, var, **kwargs)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        """Decays ``var`` and then applies the sparse Adam update of the base class."""
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        # The control dependency makes the decay run before the Adam update.
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_sparse(
                grad, var, indices, **kwargs)

    def get_config(self):
        """Returns the base config extended with ``weight_decay_rate``.

        NOTE(review): the include/exclude pattern lists are not part of the
        config - confirm whether they should be serialized as well.
        """
        config = super(AdamWeightDecay, self).get_config()
        config.update({
            'weight_decay_rate': self.weight_decay_rate,
        })
        return config

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`.

        Args:
          param_name: Name of the variable.

        Returns:
          ``False`` if the decay rate is zero. Otherwise ``True`` if the name matches
          an include pattern, ``False`` if it matches an exclude pattern, and ``True``
          if it matches neither. Include patterns are checked first.

        """
        if self.weight_decay_rate == 0:
            return False

        if self._include_in_weight_decay:
            for r in self._include_in_weight_decay:
                if re.search(r, param_name) is not None:
                    return True

        if self._exclude_from_weight_decay:
            for r in self._exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True

    def apply_gradients(self, grads_and_vars, name=None, **kwargs):
        """Applies gradients through the base optimizer without modifying them.

        Args:
          grads_and_vars: Iterable of ``(gradient, variable)`` pairs.
          name: Forwarded to the base class. (Default value = None)
          **kwargs: Forwarded to the base class.

        Returns:
          Whatever the base class ``apply_gradients`` returns.

        """
        # Materialize the pairs once so that one-shot iterables can be re-zipped.
        grads, tvars = list(zip(*grads_and_vars))
        return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars), name=name, **kwargs)


================================================
FILE: hanlp/pretrained/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 19:10
from hanlp.pretrained import tok
from hanlp.pretrained import dep
from hanlp.pretrained import sdp
from hanlp.pretrained import glove
from hanlp.pretrained import pos
from hanlp.pretrained import rnnlm
from hanlp.pretrained import word2vec
from hanlp.pretrained import ner
from hanlp.pretrained import classifiers
from hanlp.pretrained import fasttext
from hanlp.pretrained import mtl
from hanlp.pretrained import eos
from hanlp.pretrained import sts
from hanlp.pretrained import constituency
from hanlp.pretrained import amr
from hanlp.pretrained import amr2text
from hanlp.pretrained import srl

# Will be filled up during runtime
ALL = {}


================================================
FILE: hanlp/pretrained/amr.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-25 11:47
from hanlp_common.constant import HANLP_URL

AMR3_SEQ2SEQ_BART_LARGE = HANLP_URL + 'amr/amr3_seq2seq_bart_large_83.30_20220125_114450.zip'
'''A seq2seq (:cite:`bevilacqua-etal-2021-one`) BART (:cite:`lewis-etal-2020-bart`) large parser trained on Abstract 
Meaning Representation 3.0 (:cite:`knight2014abstract`). Its performance is

 =================== ========= ========= ========= 
  Metric              P         R         F1       
 =================== ========= ========= ========= 
  Smatch              84.00     82.60     83.30    
  Unlabeled           86.40     84.90     85.70    
  No WSD              84.50     83.10     83.80    
  Non_sense_frames    91.90     91.30     91.60    
  Wikification        81.70     80.80     81.20    
  Named Ent.          89.20     87.00     88.10    
  Negations           71.70     70.90     71.30    
  IgnoreVars          73.80     73.10     73.50    
  Concepts            90.70     89.60     90.10    
  Frames              88.50     87.90     88.20    
  Reentrancies        70.40     71.80     71.10    
  SRL                 79.00     79.60     79.30    
 =================== ========= ========= ========= 
    
Note this parser does NOT perform wikification.
'''

AMR3_GRAPH_PRETRAIN_PARSER = HANLP_URL + 'amr/amr3_graph_pretrain_parser_20221207_153759.zip'
'''A seq2seq (:cite:`bevilacqua-etal-2021-one`) BART (:cite:`lewis-etal-2020-bart`) large parser trained on Abstract 
Meaning Representation 3.0 (:cite:`knight2014abstract`) with graph pre-training (:cite:`bai-etal-2022-graph`). 
Its performance is ``84.3`` according to their official repository. Using ``amr-evaluation-enhanced``, the performance is
slightly lower:

 =================== ========= ========= ========= 
  Metric              P         R         F1       
 =================== ========= ========= ========= 
  Smatch             84.4       83.6        84.0       
  Unlabeled          86.7       85.8        86.2       
  No WSD             84.9       84.1        84.5       
  Non_sense_frames   91.8       91.6        91.7       
  Wikification       83.6       81.7        82.6       
  Named Ent.         89.3       87.4        88.4       
  Negations          71.6       72.2        71.9       
  IgnoreVars         74.6       74.2        74.4       
  Concepts           90.7       90.0        90.3       
  Frames             88.8       88.5        88.7       
  Reentrancies       72.1       72.9        72.5       
  SRL                80.1       80.7        80.4      
 =================== ========= ========= ========= 
    
Note this parser does NOT perform wikification.
'''

MRP2020_AMR_ENG_ZHO_XLM_BASE = 'http://download.hanlp.com/amr/extra/amr-eng-zho-xlm-roberta-base_20220412_223756.zip'
'''A wrapper for the Permutation-invariant Semantic Parser (:cite:`samuel-straka-2020-ufal`) trained on MRP2020 English 
and Chinese AMR corpus. It was ranked the top in the MRP2020 competition, while this release is a base version. 
See the original paper for the detailed performance. Note this model requires tokens and lemmas (for English) to be 
provided as inputs. 
'''

MRP2020_AMR_ZHO_MENGZI_BASE = 'http://download.hanlp.com/amr/extra/amr-zho-mengzi-base_20220415_101941.zip'
'''A Chinese Permutation-invariant Semantic Parser (:cite:`samuel-straka-2020-ufal`) trained on MRP2020  
Chinese AMR corpus using Mengzi BERT base (:cite:`zhang2021mengzi`). Its performance on dev set is 
``{amr-zho [tops F1: 85.43%][anchors F1: 93.41%][labels F1: 87.68%][properties F1: 82.02%][edges F1: 73.17%]
[attributes F1: 0.00%][all F1: 84.11%]}``. Test set performance is unknown since the test set is not released to the 
public. 
'''

# Will be filled up during runtime
ALL = {}


================================================
FILE: hanlp/pretrained/amr2text.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-07 15:19
from hanlp_common.constant import HANLP_URL

AMR3_GRAPH_PRETRAIN_GENERATION = HANLP_URL + 'amr2text/amr3_graph_pretrain_generation_20221207_153535.zip'
'''A seq2seq (:cite:`bevilacqua-etal-2021-one`) BART (:cite:`lewis-etal-2020-bart`) large AMR2Text generator trained on 
Abstract Meaning Representation 3.0 (:cite:`knight2014abstract`) with graph pre-training (:cite:`bai-etal-2022-graph`). 
Its Sacre-BLEU is ``50.38`` according to their official repository.
'''

# Will be filled up during runtime
ALL = {}


================================================
FILE: hanlp/pretrained/classifiers.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 03:51
from hanlp_common.constant import HANLP_URL

CHNSENTICORP_BERT_BASE_ZH = HANLP_URL + 'classification/chnsenticorp_bert_base_20211228_163210.zip'
SST2_ALBERT_BASE_EN = HANLP_URL + 'classification/sst2_albert_base_20211228_164917.zip'

LID_176_FASTTEXT_BASE = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
'''
126MB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes.
'''
LID_176_FASTTEXT_SMALL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'
'''
917kB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes.
'''

ALL = {}


================================================
FILE: hanlp/pretrained/constituency.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-18 10:34
from hanlp_common.constant import HANLP_URL

CTB9_CON_ELECTRA_SMALL = HANLP_URL + 'constituency/ctb9_con_electra_small_20220215_230116.zip'
'Electra (:cite:`clark2020electra`) small tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with major categories. ' \
'Its performance is UCM=39.06% LCM=34.99% UP=90.05% UR=90.01% UF=90.03% LP=87.02% LR=86.98% LF=87.00%.'

CTB9_CON_FULL_TAG_ELECTRA_SMALL = HANLP_URL + 'constituency/ctb9_full_tag_con_electra_small_20220118_103119.zip'
'Electra (:cite:`clark2020electra`) small tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with full subcategories. ' \
'Its performance is UCM=38.29% LCM=28.95% UP=90.16% UR=90.13% UF=90.15% LP=83.46% LR=83.43% LF=83.45%.'

CTB9_CON_FULL_TAG_ERNIE_GRAM = 'http://download.hanlp.com/constituency/extra/ctb9_full_tag_con_ernie_20220331_121430.zip'
'ERNIE-GRAM (:cite:`xiao-etal-2021-ernie`) base tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with full subcategories. ' \
'Its performance is UCM=42.04% LCM=31.72% UP=91.33% UR=91.53% UF=91.43% LP=85.31% LR=85.49% LF=85.40%.'

# Will be filled up during runtime
ALL = {}


================================================
FILE: hanlp/pretrained/dep.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 02:55
from hanlp_common.constant import HANLP_URL

CTB5_BIAFFINE_DEP_ZH = HANLP_URL + 'dep/biaffine_ctb5_20191229_025833.zip'
'Biaffine LSTM model (:cite:`dozat:17a`) trained on CTB5.'
CTB7_BIAFFINE_DEP_ZH = HANLP_URL + 'dep/biaffine_ctb7_20200109_022431.zip'
'Biaffine LSTM model (:cite:`dozat:17a`) trained on CTB7.'
CTB9_DEP_ELECTRA_SMALL = HANLP_URL + 'dep/ctb9_dep_electra_small_20220216_100306.zip'
'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on CTB9-SD330. ' \
'Performance is UAS=87.68% LAS=83.54%.'
PMT1_DEP_ELECTRA_SMALL = HANLP_URL + 'dep/pmt_dep_electra_small_20220218_134518.zip'
'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on PKU ' \
'Multi-view Chinese Treebank (PMT) 1.0 (:cite:`qiu-etal-2014-multi`). Performance is UAS=91.21% LAS=88.65%.'
CTB9_UDC_ELECTRA_SMALL = HANLP_URL + 'dep/udc_dep_electra_small_20220218_095452.zip'
'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on CTB9-UD420. ' \
'Performance is UAS=85.92% LAS=81.13%.'

PTB_BIAFFINE_DEP_EN = HANLP_URL + 'dep/ptb_dep_biaffine_20200101_174624.zip'
'Biaffine LSTM model (:cite:`dozat:17a`) trained on PTB.'

ALL = {}


================================================
FILE: hanlp/pretrained/eos.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-22 13:22
from hanlp_common.constant import HANLP_URL

UD_CTB_EOS_MUL = HANLP_URL + 'eos/eos_ud_ctb_mul_20201222_133543.zip'
'EOS model (:cite:`Schweter:Ahmed:2019`) trained on concatenated UD2.3 and CTB9.'

# Will be filled up during runtime
ALL = {}


================================================
FILE: hanlp/pretrained/fasttext.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-30 18:57
FASTTEXT_DEBUG_EMBEDDING_EN = 'https://elit-models.s3-us-west-2.amazonaws.com/fasttext.debug.bin.zip'
FASTTEXT_CC_300_EN = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz'
'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Common Crawl.'
FASTTEXT_WIKI_NYT_AMAZON_FRIENDS_200_EN \
    = 'https://elit-models.s3-us-west-2.amazonaws.com/fasttext-200-wikipedia-nytimes-amazon-friends-20191107.bin'
'FastText (:cite:`bojanowski2017enriching`) embeddings trained on wikipedia, nytimes, amazon and friends.'

FASTTEXT_WIKI_300_ZH = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh.zip#wiki.zh.bin'
'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Chinese Wikipedia.'
FASTTEXT_WIKI_300_ZH_CLASSICAL = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_classical.zip#wiki.zh_classical.bin'
'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Classical Chinese Wikipedia.'

ALL = {}


================================================
FILE: hanlp/pretrained/glove.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-27 20:42

_GLOVE_6B_ROOT = 'http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip'

GLOVE_6B_50D = _GLOVE_6B_ROOT + '#' + 'glove.6B.50d.txt'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 50d trained on 6B tokens.'
GLOVE_6B_100D = _GLOVE_6B_ROOT + '#' + 'glove.6B.100d.txt'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 100d trained on 6B tokens.'
GLOVE_6B_200D = _GLOVE_6B_ROOT + '#' + 'glove.6B.200d.txt'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 200d trained on 6B tokens.'
GLOVE_6B_300D = _GLOVE_6B_ROOT + '#' + 'glove.6B.300d.txt'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 300d trained on 6B tokens.'

GLOVE_840B_300D = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 300d trained on 840B tokens.'

ALL = {}


================================================
FILE: hanlp/pretrained/mtl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-22 13:16
from hanlp_common.constant import HANLP_URL

# Identifiers of pretrained multi-task learning (MTL) models. Each constant is an archive location
# built from HANLP_URL, and the bare string that follows it is its attribute docstring.
OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH = HANLP_URL + 'mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip'
"Electra (:cite:`clark2020electra`) small version of joint tok, pos, ner, srl, dep, sdp and con model trained on open-source Chinese corpus."
OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH = HANLP_URL + 'mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip'
"Electra (:cite:`clark2020electra`) base version of joint tok, pos, ner, srl, dep, sdp and con model trained on open-source Chinese corpus."
CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip'
"Electra (:cite:`clark2020electra`) small version of joint tok, pos, ner, srl, dep (SD Standard), sdp and con model trained on close-source Chinese corpus."
CLOSE_TOK_POS_NER_SRL_UDEP_SDP_CON_ELECTRA_SMALL_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20220626_175100.zip'
'''
Electra (:cite:`clark2020electra`) small version of joint tok, pos, ner, srl, dep (UD Standard), sdp and con model trained on close-source Chinese corpus.
Performance: ``{con UCM: 39.33% LCM: 35.69% UP: 90.24% UR: 90.28% UF: 90.26% LP: 87.55% LR: 87.59% LF: 87.57%}{dep UAS: 86.80% LAS: 82.82%}{ner/msra P: 95.45% R: 96.65% F1: 96.05%}{ner/ontonotes P: 75.98% R: 79.09% F1: 77.50%}{ner/pku P: 95.77% R: 96.75% F1: 96.26%}{pos/863 Accuracy:94.83%}{pos/ctb Accuracy:96.57%}{pos/pku Accuracy:97.54%}{sdp UF: 85.55% LF: 73.67%}{srl P: 75.71% R: 74.25% F1: 74.97%}{tok/coarse P: 97.77% R: 97.70% F1: 97.74%}{tok/fine P: 97.44% R: 97.32% F1: 97.38%}``.
'''
CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip'
"Electra (:cite:`clark2020electra`) base version of joint tok, pos, ner, srl, dep, sdp and con model trained on close-source Chinese corpus."
CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip'
"ERNIE (:cite:`xiao-etal-2021-ernie`) base version of joint tok, pos, ner, srl, dep, sdp and con model trained on close-source Chinese corpus."

UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6 = HANLP_URL + 'mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mMiniLMv2L6_no_space_20220731_161526.zip'
'''
mMiniLMv2 (:cite:`wang-etal-2021-minilmv2`) L6xH384 small version of joint tok, pos, lem, fea, ner, srl, dep, sdp and con model trained on UD 2.10 and OntoNotes5 corpora.
The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, AncientGreek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, AssyrianNeo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, ChurchSlavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo(Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, LiteraryChinese, Lithuanian, Livvi, LowGerman, Madi, Makuráp, Maltese, Manx, Marathi, MbyáGuaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, NorthernKurdish, Northern Sami, Norwegian, OldFrench (842-ca. 1400), OldRussian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, ScottishGaelic, Serbian, SkoltSami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, SwedishSign Language, SwissGerman, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, UpperSorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, WesternFrisian, Wolof, Xibe, Yakut, Yoruba, YueChinese``.
Performance: ``{con UCM: 15.48% LCM: 11.45% UP: 68.92% UR: 66.88% UF: 67.88% LP: 61.19% LR: 59.38% LF: 60.27%}{ner P: 76.06% R: 77.83% F1: 76.93%}{sdp/dm UF: 91.84% LF: 91.00%}{sdp/pas UF: 95.46% LF: 93.90%}{sdp/psd UF: 91.94% LF: 81.26%}{srl [predicate P: 91.71% R: 74.51% F1: 82.22%][e2e P: 77.48% R: 55.28% F1: 64.52%]}{tok P: 93.17% R: 93.53% F1: 93.35%}{ud [lemmas Accuracy:81.74%][upos Accuracy:85.94%][deps UAS: 80.60% LAS: 71.21%][feats Accuracy:77.17%]}``.
'''
UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L12 = HANLP_URL + 'mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mMiniLMv2L12_no_space_20220807_133143.zip'
'''
mMiniLMv2 (:cite:`wang-etal-2021-minilmv2`) L12xH384 base version of joint tok, pos, lem, fea, ner, srl, dep, sdp and con model trained on UD 2.10 and OntoNotes5 corpora.
The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, AncientGreek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, AssyrianNeo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, ChurchSlavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo(Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, LiteraryChinese, Lithuanian, Livvi, LowGerman, Madi, Makuráp, Maltese, Manx, Marathi, MbyáGuaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, NorthernKurdish, Northern Sami, Norwegian, OldFrench (842-ca. 1400), OldRussian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, ScottishGaelic, Serbian, SkoltSami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, SwedishSign Language, SwissGerman, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, UpperSorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, WesternFrisian, Wolof, Xibe, Yakut, Yoruba, YueChinese``.
Performance: ``{con UCM: 17.32% LCM: 13.28% UP: 70.53% UR: 68.73% UF: 69.62% LP: 63.03% LR: 61.42% LF: 62.22%}{ner P: 76.91% R: 78.72% F1: 77.80%}{sdp/dm UF: 92.78% LF: 92.02%}{sdp/pas UF: 96.43% LF: 95.02%}{sdp/psd UF: 92.75% LF: 81.86%}{srl [predicate P: 91.82% R: 77.57% F1: 84.10%][e2e P: 78.33% R: 59.14% F1: 67.40%]}{tok P: 93.69% R: 94.34% F1: 94.02%}{ud [lemmas Accuracy:82.48%][upos Accuracy:87.09%][deps UAS: 82.41% LAS: 73.69%][feats Accuracy:78.58%]}``.
'''
UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE = HANLP_URL + 'mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20220608_003435.zip'
'''
XLM-R (:cite:`conneau-etal-2020-unsupervised`) base version of joint tok, pos, lem, fea, ner, srl, dep, sdp and con model trained on UD 2.10 and OntoNotes5 corpora.
The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, AncientGreek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, AssyrianNeo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, ChurchSlavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo(Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, LiteraryChinese, Lithuanian, Livvi, LowGerman, Madi, Makuráp, Maltese, Manx, Marathi, MbyáGuaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, NorthernKurdish, Northern Sami, Norwegian, OldFrench (842-ca. 1400), OldRussian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, ScottishGaelic, Serbian, SkoltSami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, SwedishSign Language, SwissGerman, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, UpperSorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, WesternFrisian, Wolof, Xibe, Yakut, Yoruba, YueChinese``.
Performance: ``{con UCM: 20.31% LCM: 16.82% UP: 77.50% UR: 76.63% UF: 77.06% LP: 71.25% LR: 70.46% LF: 70.85%}{ner P: 79.93% R: 80.76% F1: 80.34%}{sdp/dm UF: 93.71% LF: 93.00%}{sdp/pas UF: 97.63% LF: 96.37%}{sdp/psd UF: 93.08% LF: 80.95%}{srl [predicate P: 90.95% R: 84.25% F1: 87.47%][e2e P: 78.89% R: 67.32% F1: 72.65%]}{tok P: 98.50% R: 98.70% F1: 98.60%}{ud [lemmas Accuracy:85.95%][upos Accuracy:89.95%][deps UAS: 85.78% LAS: 78.51%][feats Accuracy:82.18%]}``.
'''

NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA = HANLP_URL + 'mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'
'BERT (:cite:`devlin-etal-2019-bert`) base char encoder trained on NPCMJ/UD/Kyoto corpora with decoders including tok, pos, ner, dep, con, srl.'

# Will be filled up during runtime
ALL = {}


================================================
FILE: hanlp/pretrained/ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-30 20:07
from hanlp_common.constant import HANLP_URL

# Identifiers of pretrained named entity recognition models, each an archive path under HANLP_URL.
MSRA_NER_BERT_BASE_ZH = f'{HANLP_URL}ner/ner_bert_base_msra_20211227_114712.zip'
'BERT model (:cite:`devlin-etal-2019-bert`) trained on MSRA with 3 entity types.'
MSRA_NER_ALBERT_BASE_ZH = f'{HANLP_URL}ner/msra_ner_albert_base_20211228_173323.zip'
'ALBERT model (:cite:`Lan2020ALBERT:`) trained on MSRA with 3 entity types.'
MSRA_NER_ELECTRA_SMALL_ZH = f'{HANLP_URL}ner/msra_ner_electra_small_20220215_205503.zip'
'Electra small model (:cite:`clark2020electra`) trained on MSRA with 26 entity types. F1 = `95.16`'
CONLL03_NER_BERT_BASE_CASED_EN = f'{HANLP_URL}ner/ner_conll03_bert_base_cased_en_20211227_121443.zip'
'BERT model (:cite:`devlin-etal-2019-bert`) trained on CoNLL03.'

ALL = {}


================================================
FILE: hanlp/pretrained/pos.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 01:57
from hanlp_common.constant import HANLP_URL

# Identifiers of pretrained part-of-speech taggers, each an archive path under HANLP_URL.
CTB5_POS_RNN = f'{HANLP_URL}pos/ctb5_pos_rnn_20200113_235925.zip'
'An old school BiLSTM tagging model trained on CTB5.'
CTB5_POS_RNN_FASTTEXT_ZH = f'{HANLP_URL}pos/ctb5_pos_rnn_fasttext_20191230_202639.zip'
'An old school BiLSTM tagging model with FastText (:cite:`bojanowski2017enriching`) embeddings trained on CTB5.'
CTB9_POS_ALBERT_BASE = f'{HANLP_URL}pos/ctb9_albert_base_20211228_163935.zip'
'ALBERT model (:cite:`Lan2020ALBERT:`) trained on CTB9 (:cite:`https://doi.org/10.35111/gvd0-xk91`). This is a TF component.'
CTB9_POS_ELECTRA_SMALL_TF = f'{HANLP_URL}pos/pos_ctb_electra_small_20211227_121341.zip'
'Electra small model (:cite:`clark2020electra`) trained on CTB9 (:cite:`https://doi.org/10.35111/gvd0-xk91`). Accuracy = `96.75`. This is a TF component.'
CTB9_POS_ELECTRA_SMALL = f'{HANLP_URL}pos/pos_ctb_electra_small_20220215_111944.zip'
'Electra small model (:cite:`clark2020electra`) trained on CTB9 (:cite:`https://doi.org/10.35111/gvd0-xk91`). Accuracy = `96.26`.'
CTB9_POS_RADICAL_ELECTRA_SMALL = f'{HANLP_URL}pos/pos_ctb_radical_electra_small_20220215_111932.zip'
'Electra small model (:cite:`clark2020electra`) with radical embeddings (:cite:`he2018dual`) trained on CTB9 (:cite:`https://doi.org/10.35111/gvd0-xk91`). Accuracy = `96.14`.'
C863_POS_ELECTRA_SMALL = f'{HANLP_URL}pos/pos_863_electra_small_20220217_101958.zip'
'Electra small model (:cite:`clark2020electra`) trained on Chinese 863 corpus. Accuracy = `95.19`.'
PKU_POS_ELECTRA_SMALL = f'{HANLP_URL}pos/pos_pku_electra_small_20220217_142436.zip'
'Electra small model (:cite:`clark2020electra`) trained on Chinese PKU corpus. Accuracy = `97.55`.'
PTB_POS_RNN_FASTTEXT_EN = f'{HANLP_URL}pos/ptb_pos_rnn_fasttext_20220418_101708.zip'
'An old school BiLSTM tagging model with FastText (:cite:`bojanowski2017enriching`) embeddings trained on PTB.'

ALL = {}


================================================
FILE: hanlp/pretrained/rnnlm.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-19 03:47
from hanlp_common.constant import HANLP_URL

# Identifiers of pretrained RNN language models. The forward and backward TF variants point at
# the same archive and differ only in the name after '#'.
FLAIR_LM_FW_WMT11_EN_TF = f'{HANLP_URL}lm/flair_lm_wmt11_en_20200211_091932.zip#flair_lm_fw_wmt11_en'
'The forward LSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).'
FLAIR_LM_BW_WMT11_EN_TF = f'{HANLP_URL}lm/flair_lm_wmt11_en_20200211_091932.zip#flair_lm_bw_wmt11_en'
'The backward LSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).'
FLAIR_LM_WMT11_EN = f'{HANLP_URL}lm/flair_lm_wmt11_en_20200601_205350.zip'
'The BiLSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).'

ALL = {}


================================================
FILE: hanlp/pretrained/sdp.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 23:54
from hanlp_common.constant import HANLP_URL

# Identifiers of pretrained semantic dependency parsers, each an archive path under HANLP_URL.
SEMEVAL16_NEWS_BIAFFINE_ZH = f'{HANLP_URL}sdp/semeval16-news-biaffine_20191231_235407.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 news data.'
SEMEVAL16_TEXT_BIAFFINE_ZH = f'{HANLP_URL}sdp/semeval16-text-biaffine_20200101_002257.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 text data.'

SEMEVAL16_ALL_ELECTRA_SMALL_ZH = f'{HANLP_URL}sdp/semeval16_sdp_electra_small_20220719_171433.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 text and news data. Performance: ``UF: 83.03% LF: 72.58%``'

SEMEVAL15_PAS_BIAFFINE_EN = f'{HANLP_URL}sdp/semeval15_biaffine_pas_20200103_152405.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 PAS data.'
SEMEVAL15_PSD_BIAFFINE_EN = f'{HANLP_URL}sdp/semeval15_biaffine_psd_20200106_123009.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 PSD data.'
SEMEVAL15_DM_BIAFFINE_EN = f'{HANLP_URL}sdp/semeval15_biaffine_dm_20200106_122808.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 DM data.'

ALL = {}


================================================
FILE: hanlp/pretrained/srl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-08-07 19:07
from hanlp_common.constant import HANLP_URL

# Identifier of the pretrained semantic role labeling model, an archive path under HANLP_URL.
CPB3_SRL_ELECTRA_SMALL = f'{HANLP_URL}srl/cpb3_electra_small_crf_has_transform_20220218_135910.zip'
'Electra small model (:cite:`clark2020electra`) trained on CPB3. P=75.87% R=76.24% F1=76.05%.'

ALL = {}


================================================
FILE: hanlp/pretrained/sts.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-24 12:51
from hanlp_common.constant import HANLP_URL

# Identifier of the pretrained semantic textual similarity model, an archive path under HANLP_URL.
STS_ELECTRA_BASE_ZH = f'{HANLP_URL}sts/sts_electra_base_zh_20210530_200109.zip'
'A naive regression model trained on concatenated STS corpora.'

# Will be filled up during runtime
ALL = {}


================================================
FILE: hanlp/pretrained/tok.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:12
from hanlp_common.constant import HANLP_URL

# Identifiers of pretrained tokenizers. Most are archive paths under HANLP_URL; the "extra"
# models are given as absolute download.hanlp.com URLs instead.
SIGHAN2005_PKU_CONVSEG = HANLP_URL + 'tok/sighan2005-pku-convseg_20200110_153722.zip'
'Conv model (:cite:`wang-xu-2017-convolutional`) trained on sighan2005 pku dataset.'
SIGHAN2005_MSR_CONVSEG = HANLP_URL + 'tok/convseg-msr-nocrf-noembed_20200110_153524.zip'
'Conv model (:cite:`wang-xu-2017-convolutional`) trained on sighan2005 msr dataset.'
CTB6_CONVSEG = HANLP_URL + 'tok/ctb6_convseg_nowe_nocrf_20200110_004046.zip'
'Conv model (:cite:`wang-xu-2017-convolutional`) trained on CTB6 dataset.'
PKU_NAME_MERGED_SIX_MONTHS_CONVSEG = HANLP_URL + 'tok/pku98_6m_conv_ngram_20200110_134736.zip'
'Conv model (:cite:`wang-xu-2017-convolutional`) trained on pku98 six months dataset with family name and given name merged into one unit.'
LARGE_ALBERT_BASE = HANLP_URL + 'tok/large_corpus_cws_albert_base_20211228_160926.zip'
'ALBERT model (:cite:`Lan2020ALBERT:`) trained on the largest CWS dataset in the world.'
SIGHAN2005_PKU_BERT_BASE_ZH = HANLP_URL + 'tok/sighan2005_pku_bert_base_zh_20201231_141130.zip'
'BERT model (:cite:`devlin-etal-2019-bert`) trained on sighan2005 pku dataset.'
COARSE_ELECTRA_SMALL_ZH = HANLP_URL + 'tok/coarse_electra_small_20220616_012050.zip'
'Electra (:cite:`clark2020electra`) small model trained on coarse-grained CWS corpora. Its performance is ``P: 98.34% R: 98.38% F1: 98.36%`` which is ' \
'much higher than that of MTL model '
FINE_ELECTRA_SMALL_ZH = HANLP_URL + 'tok/fine_electra_small_20220615_231803.zip'
'Electra (:cite:`clark2020electra`) small model trained on fine-grained CWS corpora. Its performance is ``P: 98.14% R: 98.07% F1: 98.11%`` which is ' \
'much higher than that of MTL model '
CTB9_TOK_ELECTRA_SMALL = HANLP_URL + 'tok/ctb9_electra_small_20220215_205427.zip'
'Electra (:cite:`clark2020electra`) small model trained on CTB9. Its performance is P=97.15% R=97.36% F1=97.26% which is ' \
'much higher than that of MTL model '
CTB9_TOK_ELECTRA_BASE = 'http://download.hanlp.com/tok/extra/ctb9_tok_electra_base_20220426_111949.zip'
'Electra (:cite:`clark2020electra`) base model trained on CTB9. Its performance is ``P: 97.62% R: 97.67% F1: 97.65%`` ' \
'which is much higher than that of MTL model '
CTB9_TOK_ELECTRA_BASE_CRF = 'http://download.hanlp.com/tok/extra/ctb9_tok_electra_base_crf_20220426_161255.zip'
'Electra (:cite:`clark2020electra`) base model trained on CTB9. Its performance is ``P: 97.68% R: 97.71% F1: 97.69%`` ' \
'which is much higher than that of MTL model '
MSR_TOK_ELECTRA_BASE_CRF = 'http://download.hanlp.com/tok/extra/msra_crf_electra_base_20220507_113936.zip'
'Electra (:cite:`clark2020electra`) base model trained on MSR CWS dataset. Its performance is ``P: 98.71% R: 98.64% F1: 98.68%`` ' \
'which is much higher than that of MTL model '

UD_TOK_MMINILMV2L6 = HANLP_URL + 'tok/ud_tok_mMiniLMv2L6_no_space_mul_20220619_091824.zip'
'''
mMiniLMv2 (:cite:`wang-etal-2021-minilmv2`) L6xH384 based tokenizer trained on UD 2.10.
The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, AncientGreek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, AssyrianNeo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, ChurchSlavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo(Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, LiteraryChinese, Lithuanian, Livvi, LowGerman, Madi, Makuráp, Maltese, Manx, Marathi, MbyáGuaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, NorthernKurdish, Northern Sami, Norwegian, OldFrench (842-ca. 1400), OldRussian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, ScottishGaelic, Serbian, SkoltSami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, SwedishSign Language, SwissGerman, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, UpperSorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, WesternFrisian, Wolof, Xibe, Yakut, Yoruba, YueChinese``.
Performance: ``P: 94.99% R: 94.74% F1: 94.86%``.
'''
UD_TOK_MMINILMV2L12 = HANLP_URL + 'tok/ud_tok_mMiniLMv2L12_no_space_mul_20220619_091159.zip'
'''
mMiniLMv2 (:cite:`wang-etal-2021-minilmv2`) L12xH384 based tokenizer trained on UD 2.10.
The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, AncientGreek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, AssyrianNeo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, ChurchSlavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo(Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, LiteraryChinese, Lithuanian, Livvi, LowGerman, Madi, Makuráp, Maltese, Manx, Marathi, MbyáGuaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, NorthernKurdish, Northern Sami, Norwegian, OldFrench (842-ca. 1400), OldRussian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, ScottishGaelic, Serbian, SkoltSami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, SwedishSign Language, SwissGerman, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, UpperSorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, WesternFrisian, Wolof, Xibe, Yakut, Yoruba, YueChinese``.
Performance: ``P: 95.41% R: 95.25% F1: 95.33%``.
'''

# Will be filled up during runtime
ALL = {}


================================================
FILE: hanlp/pretrained/word2vec.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 18:25
from hanlp_common.constant import HANLP_URL

# One archive holding several embedding files; each derived constant appends '#<file name>'
# (presumably selecting that member of the archive -- confirm against the downloader).
CONVSEG_W2V_NEWS_TENSITE = f'{HANLP_URL}embeddings/convseg_embeddings.zip'
CONVSEG_W2V_NEWS_TENSITE_WORD_PKU = f'{CONVSEG_W2V_NEWS_TENSITE}#news_tensite.pku.words.w2v50'
CONVSEG_W2V_NEWS_TENSITE_WORD_MSR = f'{CONVSEG_W2V_NEWS_TENSITE}#news_tensite.msr.words.w2v50'
CONVSEG_W2V_NEWS_TENSITE_CHAR = f'{CONVSEG_W2V_NEWS_TENSITE}#news_tensite.w2v200'

# Same layout: a shared archive plus per-file suffixes.
SEMEVAL16_EMBEDDINGS_CN = f'{HANLP_URL}embeddings/semeval16_embeddings.zip'
SEMEVAL16_EMBEDDINGS_300_NEWS_CN = f'{SEMEVAL16_EMBEDDINGS_CN}#news.fasttext.300.txt'
SEMEVAL16_EMBEDDINGS_300_TEXT_CN = f'{SEMEVAL16_EMBEDDINGS_CN}#text.fasttext.300.txt'

CTB5_FASTTEXT_300_CN = f'{HANLP_URL}embeddings/ctb.fasttext.300.txt.zip'

TENCENT_AILAB_EMBEDDING_SMALL_200 = 'https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d200-v0.2.0-s.tar.gz#tencent-ailab-embedding-zh-d200-v0.2.0-s.txt'
'Chinese word embeddings (:cite:`NIPS2013_9aa42b31`) with small vocabulary size and 200 dimension provided by Tencent AI lab.'
TENCENT_AILAB_EMBEDDING_LARGE_200 = 'https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d200-v0.2.0.tar.gz#tencent-ailab-embedding-zh-d200-v0.2.0.txt'
'Chinese word embeddings (:cite:`NIPS2013_9aa42b31`) with large vocabulary size and 200 dimension provided by Tencent AI lab.'
TENCENT_AILAB_EMBEDDING_SMALL_100 = 'https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d100-v0.2.0-s.tar.gz#tencent-ailab-embedding-zh-d100-v0.2.0-s.txt'
'Chinese word embeddings (:cite:`NIPS2013_9aa42b31`) with small vocabulary size and 100 dimension provided by Tencent AI lab.'
TENCENT_AILAB_EMBEDDING_LARGE_100 = 'https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d100-v0.2.0.tar.gz#tencent-ailab-embedding-zh-d100-v0.2.0.txt'
'Chinese word embeddings (:cite:`NIPS2013_9aa42b31`) with large vocabulary size and 100 dimension provided by Tencent AI lab.'

MERGE_SGNS_BIGRAM_CHAR_300_ZH = 'http://download.hanlp.com/embeddings/extra/merge_sgns_bigram_char300_20220130_214613.txt.zip'
'Chinese word embeddings trained with context features (word, ngram, character, and more) using Skip-Gram with Negative Sampling (SGNS) (:cite:`li-etal-2018-analogical`).'

RADICAL_CHAR_EMBEDDING_100 = f'{HANLP_URL}embeddings/radical_char_vec_20191229_013849.zip#character.vec.txt'
'Chinese character embedding enhanced with rich radical information (:cite:`he2018dual`).'

# Private base URL shared by the four subword-encoding CWS embedding files below.
_SUBWORD_ENCODING_CWS = 'http://download.hanlp.com/embeddings/extra/subword_encoding_cws_20200524_190636.zip'
SUBWORD_ENCODING_CWS_ZH_WIKI_BPE_50 = f'{_SUBWORD_ENCODING_CWS}#zh.wiki.bpe.vs200000.d50.w2v.txt'
SUBWORD_ENCODING_CWS_GIGAWORD_UNI = f'{_SUBWORD_ENCODING_CWS}#gigaword_chn.all.a2b.uni.ite50.vec'
SUBWORD_ENCODING_CWS_GIGAWORD_BI = f'{_SUBWORD_ENCODING_CWS}#gigaword_chn.all.a2b.bi.ite50.vec'
SUBWORD_ENCODING_CWS_CTB_GAZETTEER_50 = f'{_SUBWORD_ENCODING_CWS}#ctb.50d.vec'

ALL = {}


================================================
FILE: hanlp/transform/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 22:24

================================================
FILE: hanlp/transform/conll_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 15:30
from abc import abstractmethod
from collections import Counter
from typing import Union, Tuple, Iterable, Any, Generator

import numpy as np
import tensorflow as tf
from transformers import PreTrainedTokenizer, PretrainedConfig

from hanlp_common.constant import ROOT
from hanlp_common.structure import SerializableDict
from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp.components.parsers.alg_tf import tolist, kmeans, randperm, arange
from hanlp.components.parsers.conll import read_conll
from hanlp_common.conll import CoNLLWord, CoNLLUWord, CoNLLSentence
from hanlp.layers.transformers.utils_tf import config_is, adjust_tokens_for_transformers, convert_examples_to_features
from hanlp.utils.log_util import logger
from hanlp.utils.string_util import ispunct
from hanlp_common.util import merge_locals_kwargs


class CoNLLTransform(Transform):
    """Shared TensorFlow transform logic for CoNLL-formatted dependency data.

    Keeps the form, coarse POS and relation vocabularies, reads CoNLL files into
    per-token cells, and turns a stream of sentences into a ``tf.data.Dataset``
    of already-batched samples. Subclasses implement
    :meth:`batched_inputs_to_batches` to decide how a group of sentences becomes
    tensors.
    """

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32, min_freq=2,
                 use_pos=True, **kwargs) -> None:
        # ``locals()`` is captured on this line, so any local variable introduced
        # before it would be forwarded to the parent constructor as well.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.form_vocab: VocabTF = None
        if use_pos:
            # The POS vocabulary attribute only exists when POS tags are used.
            self.cpos_vocab: VocabTF = None
        self.rel_vocab: VocabTF = None
        self.puncts: tf.Tensor = None

    @property
    def use_pos(self):
        """Whether POS tags are part of the input; read from the config, defaulting to ``True``."""
        return self.config.get('use_pos', True)

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        """Look up the ``(form, cpos)`` pair ``x`` in the form and POS vocabulary tables."""
        form, cpos = x
        return self.form_vocab.token_to_idx_table.lookup(form), self.cpos_vocab.token_to_idx_table.lookup(cpos)

    def y_to_idx(self, y):
        """Pass the head of ``y`` through unchanged and look up its relation in the relation table."""
        head, rel = y
        return head, self.rel_vocab.token_to_idx_table.lookup(rel)

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        """Convert a batch of indices back into sentences of ``(form, cpos)`` tokens.

        Args:
          X: Either ``(forms, cposes)`` or ``(forms, cposes, mask)``. When no mask
            is given, it is derived as ``forms != 0``.

        Returns:
          A list of sentences, each a list of ``(form, cpos)`` string pairs.

        Raises:
          ValueError: If ``X`` has neither 2 nor 3 elements.
        """
        if len(X) == 2:
            form_batch, cposes_batch = X
            mask = tf.not_equal(form_batch, 0)
        elif len(X) == 3:
            form_batch, cposes_batch, mask = X
        else:
            raise ValueError(f'Expect X to be 2 or 3 elements but got {repr(X)}')
        sents = []

        for form_sent, cposes_sent, length in zip(form_batch, cposes_batch,
                                                  tf.math.count_nonzero(mask, axis=-1)):
            # Position 0 is skipped (presumably the ROOT token -- confirm).
            forms = tolist(form_sent)[1:length + 1]
            cposes = tolist(cposes_sent)[1:length + 1]
            sents.append([(self.form_vocab.idx_to_token[f],
                           self.cpos_vocab.idx_to_token[c]) for f, c in zip(forms, cposes)])

        return sents

    def lock_vocabs(self):
        """Lock the vocabularies, then cache the indices of all punctuation forms in ``self.puncts``."""
        super().lock_vocabs()
        self.puncts = tf.constant([i for s, i in self.form_vocab.token_to_idx.items()
                                   if ispunct(s)], dtype=tf.int64)

    def file_to_inputs(self, filepath: str, gold=True):
        """Yield the sentences of a CoNLL file, each token reduced to the columns this transform uses.

        Args:
          filepath: Path of the file handed to ``read_conll``.
          gold: Must be truthy; only gold files are supported.

        Yields:
          The sentence with every token replaced in place by
          ``[form, cpos, head, deprel]``, or ``[form, head, deprel]`` when POS
          tags are disabled.
        """
        assert gold, 'only support gold file for now'
        use_pos = self.use_pos
        for sent in read_conll(filepath):
            for i, cell in enumerate(sent):
                # Only columns 1 (form), 3 (cpos), 6 (head) and 7 (deprel) are read.
                form, cpos, head, deprel = cell[1], cell[3], cell[6], cell[7]
                sent[i] = [form, cpos, head, deprel] if use_pos else [form, head, deprel]
            yield sent

    @property
    def bos(self):
        """The token at index 2 of the form vocabulary, or ``ROOT`` if the vocabulary has no index table yet."""
        if self.form_vocab.idx_to_token is None:
            return ROOT
        return self.form_vocab.idx_to_token[2]

    def input_is_single_sample(self, input: Any) -> bool:
        """Tell a single sentence from a batch by checking where the first string appears.

        With POS tags a sentence is a sequence of token tuples, so the string is
        expected at ``input[0][0]``; without them it is expected at ``input[0]``.
        """
        if self.use_pos:
            return isinstance(input[0][0], str) if len(input[0]) else False
        else:
            return isinstance(input[0], str) if len(input[0]) else False

    @abstractmethod
    def batched_inputs_to_batches(self, corpus, indices, shuffle):
        """Turn the sentences ``corpus[i] for i in indices`` into batch samples; implemented by subclasses."""
        pass

    def len_of_sent(self, sent):
        """Length of ``sent`` used for batching."""
        return 1 + len(sent)  # take ROOT into account

    def samples_to_dataset(self, samples: Generator, map_x=None, map_y=None, batch_size=5000, shuffle=None, repeat=None,
                           drop_remainder=False, prefetch=1, cache=True) -> tf.data.Dataset:
        """Build a dataset whose elements are whole batches produced by custom bucketing.

        The corpus is loaded into memory when the generator first runs. With
        ``shuffle`` the sentences are clustered by length with ``kmeans`` and both
        the buckets and the sentences inside them are visited in random order;
        otherwise sentences are packed sequentially until ``batch_size`` tokens
        would be exceeded.

        Args:
          samples: The sentences, or a callable returning them.
          map_x: Unused here; ``False`` is forwarded to the parent.
          map_y: Unused here; ``False`` is forwarded to the parent.
          batch_size: Budget per batch, measured in ``len_of_sent`` units.
          shuffle: Selects the bucketing strategy described above.
          repeat: Forwarded to the parent.
          drop_remainder: Forwarded to the parent.
          prefetch: Forwarded to the parent.
          cache: Forwarded to the parent.

        Returns:
          The dataset built by ``Transform.samples_to_dataset`` from the batch generator.
        """
        if shuffle:
            def generator():
                # custom bucketing, load corpus into memory
                corpus = list(samples() if callable(samples) else samples)
                lengths = [self.len_of_sent(i) for i in corpus]
                if len(corpus) < 32:
                    n_buckets = 1
                else:
                    n_buckets = min(self.config.n_buckets, len(corpus))
                buckets = dict(zip(*kmeans(lengths, n_buckets)))
                sizes, buckets = zip(*buckets.items())
                # the number of chunks in each bucket, which is clipped by
                # range [1, len(bucket)]
                chunks = [min(len(bucket), max(round(size * len(bucket) / batch_size), 1)) for size, bucket in
                          zip(sizes, buckets)]
                max_samples_per_batch = self.config.get('max_samples_per_batch', None)
                # This branch only runs when shuffling, so orders are always random permutations.
                for i in tolist(randperm(len(buckets))):
                    split_sizes = [(len(buckets[i]) - j - 1) // chunks[i] + 1
                                   for j in range(chunks[i])]  # how many sentences in each batch
                    for batch_indices in tf.split(randperm(len(buckets[i])), split_sizes):
                        indices = [buckets[i][j] for j in tolist(batch_indices)]
                        if max_samples_per_batch:
                            # Cap the number of sentences per batch by slicing the chunk further.
                            for j in range(0, len(indices), max_samples_per_batch):
                                yield from self.batched_inputs_to_batches(corpus, indices[j:j + max_samples_per_batch],
                                                                          shuffle)
                        else:
                            yield from self.batched_inputs_to_batches(corpus, indices, shuffle)

        else:
            def generator():
                # custom bucketing, load corpus into memory
                corpus = list(samples() if callable(samples) else samples)
                n_tokens = 0
                batch = []
                for idx, sent in enumerate(corpus):
                    sent_len = self.len_of_sent(sent)
                    # Flush before overflowing, but never emit an empty batch: a single
                    # over-long sentence still forms a batch of its own.
                    if n_tokens + sent_len > batch_size and batch:
                        yield from self.batched_inputs_to_batches(corpus, batch, shuffle)
                        n_tokens = 0
                        batch = []
                    n_tokens += sent_len
                    batch.append(idx)
                if batch:
                    yield from self.batched_inputs_to_batches(corpus, batch, shuffle)

        # Batching, mapping and shuffling are already done by the generator, so they are
        # switched off (False, False, 0, False) for the parent implementation.
        return Transform.samples_to_dataset(self, generator, False, False, 0, False, repeat, drop_remainder, prefetch,
                                            cache)


class CoNLL_DEP_Transform(CoNLLTransform):

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32,
                 min_freq=2, **kwargs) -> None:
        """Create the dependency parsing transform.

        Every argument is forwarded unchanged to the parent constructor; this override only fixes the default
        values (``lower=True``, ``n_buckets=32``, ``min_freq=2``).

        Args:
          config(SerializableDict): Forwarded to the parent constructor.
          map_x: Forwarded to the parent constructor.
          map_y: Forwarded to the parent constructor.
          lower: Forwarded to the parent constructor.
          n_buckets: Forwarded to the parent constructor.
          min_freq: Forwarded to the parent constructor.
          **kwargs: Forwarded to the parent constructor.

        """
        super().__init__(config, map_x, map_y, lower, n_buckets, min_freq, **kwargs)

    def batched_inputs_to_batches(self, corpus, indices, shuffle):
        """Turn the samples selected by ``indices`` into a single padded batch.

        Args:
          corpus(list): All samples; each sample is a list of per-token cells.
          indices(list): Positions in ``corpus`` of the samples that make up this batch.
          shuffle: Not used by this implementation.

        Yields:
          One ``((forms, cpos), (heads, rels))`` tuple of ``int64`` arrays, each padded at the end of
          the sentence.

        """
        # One entry per cell column; ``None`` marks the head column whose values are kept as they are.
        vocabs = (self.form_vocab, self.cpos_vocab, None, self.rel_vocab)
        columns = [[] for _ in vocabs]
        for sample_idx in indices:
            rows = [[] for _ in vocabs]
            for cells in corpus[sample_idx]:
                for row, cell, vocab in zip(rows, cells, vocabs):
                    row.append(vocab.get_idx_without_add(cell) if vocab else cell)
            for column, row in zip(columns, rows):
                column.append(row)
        padded = [
            tf.keras.preprocessing.sequence.pad_sequences(column, padding='post',
                                                          value=vocab.safe_pad_token_idx if vocab else 0,
                                                          dtype='int64')
            for column, vocab in zip(columns, vocabs)
        ]
        assert len(padded) == 4
        forms, cpos, heads, rels = padded
        yield (forms, cpos), (heads, rels)

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Describe the dataset structure as ``(types, shapes, padding values)``.

        Each of the three results has the layout ``((forms, cpos), (heads, rels))``.

        Returns:
          The dtypes, the (fully dynamic, rank 2) shapes and the padding values of a batch.

        """
        x_types = (tf.int64, tf.int64)
        y_types = (tf.int64, tf.int64)
        x_shapes = ([None, None], [None, None])
        y_shapes = ([None, None], [None, None])
        x_values = (self.form_vocab.safe_pad_token_idx, self.cpos_vocab.safe_pad_token_idx)
        # heads are padded with 0, relations with the relation vocabulary's pad index
        y_values = (0, self.rel_vocab.safe_pad_token_idx)
        return (x_types, y_types), (x_shapes, y_shapes), (x_values, y_values)

    def inputs_to_samples(self, inputs, gold=False):
        """Normalize raw sentences into samples and prepend an artificial root row.

        Each cell is converted to a fresh list, so the sentences supplied by the caller are never modified
        (previously list cells were lower-cased and extended in place, which corrupted the caller's data and
        made the cells grow on every repeated call).

        Args:
          inputs: An iterable of sentences; each sentence is a sequence of cells, where a cell is a ``str``
            (the form only), a ``tuple`` or a ``list``.
          gold: When falsy, a placeholder head ``0`` and the relation pad token are appended to every cell.

        Yields:
          One sample (a list of cells) per sentence, with the root row at position 0.

        """
        token_mapping: dict = self.config.get('token_mapping', None)
        use_pos = self.config.get('use_pos', True)
        for sent in inputs:
            sample = []
            for cell in sent:
                if isinstance(cell, str):
                    cell = [cell]
                elif isinstance(cell, (tuple, list)):
                    # always copy: the edits below must not leak into the caller's sentence
                    cell = list(cell)
                if token_mapping:
                    cell[0] = token_mapping.get(cell[0], cell[0])
                if self.config['lower']:
                    cell[0] = cell[0].lower()
                if not gold:
                    cell += [0, self.rel_vocab.safe_pad_token]
                sample.append(cell)
            # insert root word with arbitrary fields, anyway it will be masked
            sample.insert(0, [self.bos, self.bos, 0, self.bos] if use_pos else [self.bos, 0, self.bos])
            yield sample

    def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                             gold=False, inputs=None, conll=True, arc_scores=None, rel_scores=None) -> Iterable:
        """Pair every input token with its predicted head and relation.

        Args:
          X: A ``(words, feats, mask)`` triple; only ``mask`` is read here (``X`` as a whole is handed to
            ``X_to_inputs`` when ``inputs`` is missing).
          Y: An ``(arc_preds, rel_preds)`` pair.
          gold: Not used by this implementation.
          inputs: The original sentences; recovered through ``X_to_inputs`` when ``None``.
          conll: Falsy to emit plain ``[head, deprel]`` pairs, ``'.conll'`` to emit ``CoNLLWord`` rows,
            any other truthy value to emit ``CoNLLUWord`` rows.
          arc_scores: Not used by this implementation.
          rel_scores: Not used by this implementation.

        Returns:
          A list with one ``CoNLLSentence`` per input sentence.

        """
        (_, _, mask), (arc_preds, rel_preds) = X, Y
        if inputs is None:
            inputs = self.X_to_inputs(X)
        predictions = self.Y_to_outputs((arc_preds, rel_preds, mask), inputs=inputs)
        sents = []
        for tokens, parsed in zip(inputs, predictions):
            sent = CoNLLSentence()
            for token_id, (cell, (head, deprel)) in enumerate(zip(tokens, parsed), start=1):
                if self.use_pos and not self.config.get('joint_pos', None):
                    form, cpos = cell
                else:
                    form, cpos = cell, None
                if not conll:
                    sent.append([head, deprel])
                elif conll == '.conll':
                    sent.append(CoNLLWord(id=token_id, form=form, cpos=cpos, head=head, deprel=deprel))
                else:
                    sent.append(CoNLLUWord(id=token_id, form=form, upos=cpos, head=head, deprel=deprel))
            sents.append(sent)
        return sents

    def fit(self, trn_path: str, **kwargs) -> int:
        """Build the form, POS (optional) and relation vocabularies from a training file.

        Args:
          trn_path(str): Path of the training file, read through ``file_to_samples`` with ``gold=True``.
          **kwargs: Not used by this implementation.

        Returns:
          The number of samples found in the training file.

        """
        use_pos = self.config.use_pos
        self.form_vocab = VocabTF()
        self.form_vocab.add(ROOT)  # make root the 2nd element while 0th is pad, 1st is unk
        if self.use_pos:
            self.cpos_vocab = VocabTF(pad_token=None, unk_token=None)
        self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
        num_samples = 0
        form_counts = Counter()
        for num_samples, sent in enumerate(self.file_to_samples(trn_path, gold=True), start=1):
            for position, cell in enumerate(sent):
                if use_pos:
                    form, cpos, _, deprel = cell
                    self.cpos_vocab.add(cpos)
                else:
                    form, _, deprel = cell
                # position 0 holds the artificial root row, whose form is not counted
                if position:
                    form_counts[form] += 1
                self.rel_vocab.add(deprel)

        # only forms that are frequent enough enter the vocabulary
        for token, freq in form_counts.items():
            if freq >= self.config.min_freq:
                self.form_vocab.add(token)
        return num_samples

    @property
    def root_rel_idx(self):
        """Index of the first relation whose name contains ``root`` (case-insensitive) and is not ``self.bos``.

        The value is looked up in the config first; when found by scanning the relation vocabulary it is
        written back to the config under ``'root_rel_idx'``. ``None`` is returned when nothing matches.
        """
        cached = self.config.get('root_rel_idx', None)
        if cached is not None:
            return cached
        for idx, rel in enumerate(self.rel_vocab.idx_to_token):
            if 'root' in rel.lower() and rel != self.bos:
                self.config['root_rel_idx'] = idx
                return idx
        return None

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
        """Decode predicted arcs and relation ids into ``(head, relation)`` pairs.

        Args:
          Y: An ``(arc_preds, rel_preds, mask)`` triple.
          gold: Not used by this implementation.
          inputs: Not used by this implementation.
          X: Not used by this implementation.

        Returns:
          One list per sentence holding a ``(head, relation name)`` tuple for every token; position 0 is
          skipped and the number of tokens is the count of non-zero mask entries.

        """
        arc_preds, rel_preds, mask = Y
        lengths = tf.math.count_nonzero(mask, axis=-1)
        idx_to_rel = self.rel_vocab.idx_to_token
        sents = []
        for arc_sent, rel_sent, length in zip(arc_preds, rel_preds, lengths):
            end = length + 1
            heads = tolist(arc_sent)[1:end]
            rel_ids = tolist(rel_sent)[1:end]
            sents.append([(head, idx_to_rel[rel_id]) for head, rel_id in zip(heads, rel_ids)])

        return sents


class CoNLL_Transformer_Transform(CoNLL_DEP_Transform):

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True,
                 lower=True, n_buckets=32, min_freq=0, max_seq_length=256, use_pos=False,
                 mask_p=None, graph=False, topk=None,
                 **kwargs) -> None:
        """Create the transform used by transformer based parsers.

        All arguments are handed to the parent constructor through ``merge_locals_kwargs``. The tokenizer
        and the transformer config start out as ``None`` and have to be assigned before batches are built.

        Args:
          config(SerializableDict): Forwarded to the parent constructor.
          map_x: Forwarded to the parent constructor.
          map_y: Forwarded to the parent constructor.
          lower: Forwarded to the parent constructor.
          n_buckets: Forwarded to the parent constructor.
          min_freq: Forwarded to the parent constructor.
          max_seq_length: Forwarded to the parent constructor; read back from the config when tokenizing.
          use_pos: Forwarded to the parent constructor.
          mask_p: Forwarded to the parent constructor; read back through the ``mask_p`` property.
          graph: Forwarded to the parent constructor; when truthy the orphan relation is initialised to ROOT.
          topk: Forwarded to the parent constructor; read back from the config in ``fit``.
          **kwargs: Forwarded to the parent constructor.

        """
        # NOTE(review): locals() is captured right here, so any local variable introduced above this line
        # would be handed to merge_locals_kwargs as well - confirm how that helper filters its input.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.tokenizer: PreTrainedTokenizer = None
        self.transformer_config: PretrainedConfig = None
        if graph:
            self.orphan_relation = ROOT

    def lock_vocabs(self):
        """Lock the vocabularies and, in graph mode, pick the orphan relation from the relation vocabulary."""
        super().lock_vocabs()
        if not self.graph:
            return
        CoNLL_SDP_Transform._find_orphan_relation(self)

    def fit(self, trn_path: str, **kwargs) -> int:
        """Build the vocabularies from a training file, plus an optional vocabulary of the most frequent forms.

        ``joint_pos`` in the config switches ``use_pos`` on. In graph mode the vocabularies are built by
        ``CoNLL_SDP_Transform.fit``, otherwise by the parent class. When ``topk`` is set in the config, the
        ``topk`` most frequent forms are stored in ``self.topk_vocab``.

        Args:
          trn_path(str): Path of the training file.
          **kwargs: Forwarded to the delegated ``fit``.

        Returns:
          The number of samples reported by the delegated ``fit``.

        """
        if self.config.get('joint_pos', None):
            self.config.use_pos = True
        if self.graph:
            # noinspection PyCallByClass
            num = CoNLL_SDP_Transform.fit(self, trn_path, **kwargs)
        else:
            num = super().fit(trn_path, **kwargs)
        topk = self.config.get('topk', None)
        if topk:
            counter = Counter()
            for sent in self.file_to_samples(trn_path, gold=True):
                for cell in sent:
                    # The form is always the first field. Unpacking exactly three fields, as was done
                    # before, raised ValueError for the four-field cells produced when POS tags are used.
                    counter[cell[0]] += 1
            self.topk_vocab = VocabTF()
            for form, _ in counter.most_common(topk):
                self.topk_vocab.add(form)
        return num

    def inputs_to_samples(self, inputs, gold=False):
        """Yield samples built by ``CoNLL_SDP_Transform`` in graph mode and by the parent class otherwise."""
        if not self.graph:
            yield from super().inputs_to_samples(inputs, gold)
            return
        yield from CoNLL_SDP_Transform.inputs_to_samples(self, inputs, gold)

    def file_to_inputs(self, filepath: str, gold=True):
        """Yield sentences read by ``CoNLL_SDP_Transform`` in graph mode and by the parent class otherwise."""
        if not self.graph:
            yield from super().file_to_inputs(filepath, gold)
            return
        yield from CoNLL_SDP_Transform.file_to_inputs(self, filepath, gold)

    @property
    def mask_p(self) -> float:
        """The ``mask_p`` entry of the config, or ``None`` when it is not set."""
        return self.config.get('mask_p', None)

    @property
    def graph(self):
        """The ``graph`` entry of the config, or ``None`` when it is not set."""
        return self.config.get('graph', None)

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Describe the dataset structure as ``(types, shapes, padding values)``.

        Each result has the layout ``((forms, feats), labels)``. ``feats`` has three entries, plus a fourth
        one when ``use_pos`` is on. ``labels`` has two entries (arcs and relations), plus a third one when
        ``mask_p`` is set. In graph mode the arcs are boolean and both labels have rank 3.

        Returns:
          A tuple of three descriptions, or a list of three descriptions when ``use_pos`` is on.

        """
        mask_p = self.mask_p
        graph = self.graph

        x_types = (tf.int64, (tf.int64, tf.int64, tf.int64))
        x_shapes = ([None, None], ([None, None], [None, None], [None, None]))
        x_values = (self.form_vocab.safe_pad_token_idx, (0, 0, 0))

        y_types = (tf.bool if graph else tf.int64, tf.int64)
        if graph:
            y_shapes = ([None, None, None], [None, None, None])
        else:
            y_shapes = ([None, None], [None, None])
        y_values = (0, self.rel_vocab.safe_pad_token_idx)
        if mask_p:
            # one more label slot when masking is enabled
            y_types += (tf.int64,)
            y_shapes += ([None, None],)
            y_values += (0,)

        specs = (x_types, y_types), (x_shapes, y_shapes), (x_values, y_values)
        if self.use_pos:
            # the extra feature reuses the description of the forms
            specs = [((x[0], x[1] + (x[0],)), y) for x, y in specs]
        return specs

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        """Recover the token forms of every sentence in a batch.

        Args:
          X: A ``(form_batch, feat, prefix_mask)`` triple; ``feat`` is not read.

        Returns:
          One list of forms per sentence. Position 0 is skipped and the number of forms equals the count of
          non-zero entries in the sentence's prefix mask.

        """
        form_batch, _, prefix_mask = X
        lengths = tf.math.count_nonzero(prefix_mask, axis=-1)
        idx_to_form = self.form_vocab.idx_to_token
        return [
            [idx_to_form[form_id] for form_id in tolist(form_sent)[1:length + 1]]
            for form_sent, length in zip(form_batch, lengths)
        ]

    def batched_inputs_to_batches(self, corpus, indices, shuffle):
        """Convert the samples selected by ``indices`` into one batch of transformer features and labels.

        Args:
          corpus(list): All samples; each sample is a list of per-token cells.
          indices(list): Positions in ``corpus`` of the samples forming this batch.
          shuffle: When truthy and ``mask_p`` is set, whole words are randomly replaced by the tokenizer's
            mask token; when falsy no masking is applied.

        Yields:
          One ``((forms, feats), labels)`` tuple. ``feats`` is ``(input_ids, input_mask, prefix_offset)``
          followed by the POS ids when ``use_pos`` is on; ``labels`` is ``(heads, rels)`` followed by the
          masked offsets when ``mask_p`` is set.

        """
        use_pos = self.use_pos
        if use_pos:
            raw_batch = [[], [], [], []]
        else:
            raw_batch = [[], [], []]
        if self.graph:
            max_len = len(max([corpus[i] for i in indices], key=len))
            for idx in indices:
                # ``np.bool`` was removed from NumPy (1.24); the builtin ``bool`` is the same dtype
                arc = np.zeros((max_len, max_len), dtype=bool)
                rel = np.zeros((max_len, max_len), dtype=np.int64)
                for b in raw_batch[:2 if use_pos else 1]:
                    b.append([])
                for m, cells in enumerate(corpus[idx]):
                    if use_pos:
                        for b, c, v in zip(raw_batch, cells, [None, self.cpos_vocab]):
                            b[-1].append(v.get_idx_without_add(c) if v else c)
                    else:
                        for b, c, v in zip(raw_batch, cells, [None]):
                            b[-1].append(c)
                    for n, r in zip(cells[-2], cells[-1]):
                        arc[m, n] = True
                        rid = self.rel_vocab.get_idx_without_add(r)
                        if rid is None:
                            logger.warning(f'Relation OOV: {r} not exists in train')
                            continue
                        rel[m, n] = rid
                raw_batch[-2].append(arc)
                raw_batch[-1].append(rel)
        else:
            for idx in indices:
                for s in raw_batch:
                    s.append([])
                for cells in corpus[idx]:
                    if use_pos:
                        for s, c, v in zip(raw_batch, cells, [None, self.cpos_vocab, None, self.rel_vocab]):
                            s[-1].append(v.get_idx_without_add(c) if v else c)
                    else:
                        for s, c, v in zip(raw_batch, cells, [None, None, self.rel_vocab]):
                            s[-1].append(v.get_idx_without_add(c) if v else c)

        # Transformer tokenizing
        config = self.transformer_config
        tokenizer = self.tokenizer
        xlnet = config_is(config, 'xlnet')
        roberta = config_is(config, 'roberta')
        pad_token = tokenizer.pad_token
        pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0]
        cls_token = tokenizer.cls_token
        sep_token = tokenizer.sep_token
        max_seq_length = self.config.max_seq_length
        batch_forms = []
        batch_input_ids = []
        batch_input_mask = []
        batch_prefix_offset = []
        mask_p = self.mask_p
        if mask_p:
            batch_masked_offsets = []
            mask_token_id = tokenizer.mask_token_id
        for sent_idx, sent in enumerate(raw_batch[0]):
            batch_forms.append([self.form_vocab.get_idx_without_add(token) for token in sent])
            sent = adjust_tokens_for_transformers(sent)
            sent = sent[1:]  # remove <root> use [CLS] instead
            pad_label_idx = self.form_vocab.pad_idx
            input_ids, input_mask, segment_ids, prefix_mask = \
                convert_examples_to_features(sent,
                                             max_seq_length,
                                             tokenizer,
                                             cls_token_at_end=xlnet,
                                             # xlnet has a cls token at the end
                                             cls_token=cls_token,
                                             cls_token_segment_id=2 if xlnet else 0,
                                             sep_token=sep_token,
                                             sep_token_extra=roberta,
                                             # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                                             pad_on_left=xlnet,
                                             # pad on the left for xlnet
                                             pad_token_id=pad_token_id,
                                             pad_token_segment_id=4 if xlnet else 0,
                                             pad_token_label_id=pad_label_idx,
                                             do_padding=False)
            num_masks = sum(prefix_mask)
            if num_masks < len(sent):  # long sent gets truncated, +1 for root
                batch_forms[-1] = batch_forms[-1][:num_masks + 1]  # form
                raw_batch[-1][sent_idx] = raw_batch[-1][sent_idx][:num_masks + 1]  # rel
                raw_batch[-2][sent_idx] = raw_batch[-2][sent_idx][:num_masks + 1]  # head
                # pos when use_pos is on, otherwise this is the raw form column again
                raw_batch[-3][sent_idx] = raw_batch[-3][sent_idx][:num_masks + 1]
                # NOTE(review): in graph mode the two label entries are square matrices and only their
                # rows are cut here - confirm whether the columns should be truncated as well.
            prefix_mask[0] = True  # <root> is now [CLS]
            prefix_offset = [idx for idx, m in enumerate(prefix_mask) if m]
            batch_input_ids.append(input_ids)
            batch_input_mask.append(input_mask)
            batch_prefix_offset.append(prefix_offset)
            if mask_p:
                if shuffle:
                    size = int(np.ceil(mask_p * len(prefix_offset[1:])))  # never mask [CLS]
                    mask_offsets = np.random.choice(np.arange(1, len(prefix_offset)), size, replace=False)
                    for offset in sorted(mask_offsets):
                        assert 0 < offset < len(input_ids)
                        input_ids[prefix_offset[offset]] = mask_token_id  # mask prefix
                        # whole word masking, mask the rest of the word
                        for i in range(prefix_offset[offset] + 1, len(input_ids) - 1):
                            if prefix_mask[i]:
                                break
                            input_ids[i] = mask_token_id

                    batch_masked_offsets.append(sorted(mask_offsets))
                else:
                    batch_masked_offsets.append([0])  # No masking in prediction

        batch_forms = tf.keras.preprocessing.sequence.pad_sequences(batch_forms, padding='post',
                                                                    value=self.form_vocab.safe_pad_token_idx,
                                                                    dtype='int64')
        batch_input_ids = tf.keras.preprocessing.sequence.pad_sequences(batch_input_ids, padding='post',
                                                                        value=pad_token_id,
                                                                        dtype='int64')
        batch_input_mask = tf.keras.preprocessing.sequence.pad_sequences(batch_input_mask, padding='post',
                                                                         value=0,
                                                                         dtype='int64')
        batch_prefix_offset = tf.keras.preprocessing.sequence.pad_sequences(batch_prefix_offset, padding='post',
                                                                            value=0,
                                                                            dtype='int64')
        batch_heads = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[-2], padding='post',
                                                                    value=0,
                                                                    dtype='int64')
        batch_rels = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[-1], padding='post',
                                                                   value=self.rel_vocab.safe_pad_token_idx,
                                                                   dtype='int64')
        if mask_p:
            batch_masked_offsets = tf.keras.preprocessing.sequence.pad_sequences(batch_masked_offsets, padding='post',
                                                                                 value=pad_token_id,
                                                                                 dtype='int64')
        feats = (tf.constant(batch_input_ids, dtype='int64'), tf.constant(batch_input_mask, dtype='int64'),
                 tf.constant(batch_prefix_offset))
        if use_pos:
            batch_pos = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[1], padding='post',
                                                                      value=self.cpos_vocab.safe_pad_token_idx,
                                                                      dtype='int64')
            feats += (batch_pos,)
        yield (batch_forms, feats), \
              (batch_heads, batch_rels, batch_masked_offsets) if mask_p else (batch_heads, batch_rels)

    def len_of_sent(self, sent):
        """Return how many transformer input ids a sample produces.

        The first cell of the sample (the root row) is dropped, the first field of every remaining cell is
        taken as the token, and the tokens are converted to features without padding.

        Args:
          sent: A sample, i.e. a list of cells whose first field is the form.

        Returns:
          The length of the resulting ``input_ids``.

        """
        # Transformer tokenizing
        config = self.transformer_config
        tokenizer = self.tokenizer
        is_xlnet = config_is(config, 'xlnet')
        is_roberta = config_is(config, 'roberta')
        pad_token_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
        cls_token = tokenizer.cls_token
        sep_token = tokenizer.sep_token
        max_seq_length = self.config.max_seq_length
        pad_label_idx = self.form_vocab.pad_idx
        # remove <root> use [CLS] instead, then keep the form column only
        forms = adjust_tokens_for_transformers([cell[0] for cell in sent[1:]])
        input_ids, _, _, _ = convert_examples_to_features(
            forms,
            max_seq_length,
            tokenizer,
            # xlnet has a cls token at the end
            cls_token_at_end=is_xlnet,
            cls_token=cls_token,
            cls_token_segment_id=2 if is_xlnet else 0,
            sep_token=sep_token,
            # roberta uses an extra separator b/w pairs of sentences
            sep_token_extra=is_roberta,
            # pad on the left for xlnet
            pad_on_left=is_xlnet,
            pad_token_id=pad_token_id,
            pad_token_segment_id=4 if is_xlnet else 0,
            pad_token_label_id=pad_label_idx,
            do_padding=False)
        return len(input_ids)

    def samples_to_dataset(self, samples: Generator, map_x=None, map_y=None, batch_size=5000, shuffle=None, repeat=None,
                           drop_remainder=False, prefetch=1, cache=True) -> tf.data.Dataset:
        """Build a dataset of batches from samples.

        With ``shuffle`` on, the work is delegated to ``CoNLL_DEP_Transform.samples_to_dataset``. Otherwise
        the samples are loaded into memory and packed in their original order: a batch is closed as soon as
        adding the next sentence would push the sum of ``len_of_sent`` values above ``batch_size``.

        Args:
          samples(Generator): The samples, or a callable returning them.
          map_x: Only forwarded when ``shuffle`` is on.
          map_y: Only forwarded when ``shuffle`` is on.
          batch_size: Budget per batch, measured in ``len_of_sent`` units.
          shuffle: Selects between the two strategies described above.
          repeat: Forwarded to the underlying ``samples_to_dataset``.
          drop_remainder: Forwarded to the underlying ``samples_to_dataset``.
          prefetch: Forwarded to the underlying ``samples_to_dataset``.
          cache: Forwarded to the underlying ``samples_to_dataset``.

        Returns:
          The dataset produced by the underlying ``samples_to_dataset``.

        """
        if shuffle:
            return CoNLL_DEP_Transform.samples_to_dataset(self, samples, map_x, map_y, batch_size, shuffle, repeat,
                                                          drop_remainder, prefetch, cache)

        def generator():
            # custom bucketing, load corpus into memory
            corpus = list(samples() if callable(samples) else samples)
            pending = []
            used = 0
            for position, sent in enumerate(corpus):
                cost = self.len_of_sent(sent)
                if used + cost > batch_size and pending:
                    yield from self.batched_inputs_to_batches(corpus, pending, shuffle)
                    pending, used = [], 0
                pending.append(position)
                used += cost
            if pending:
                yield from self.batched_inputs_to_batches(corpus, pending, shuffle)

        # the generator already emits complete batches, so mapping and batching are switched off below
        return Transform.samples_to_dataset(self, generator, False, False, 0, False, repeat, drop_remainder, prefetch,
                                            cache)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
        """Decode predictions into per-token outputs.

        In graph mode the ``(head, relation)`` pairs produced by ``CoNLL_SDP_Transform.Y_to_outputs`` are
        regrouped, for every token, into a ``(heads, relations)`` pair of lists. Otherwise the parent
        implementation is used as is.
        """
        if not self.graph:
            return super().Y_to_outputs(Y, gold, inputs, X)
        outputs = []
        for sent in CoNLL_SDP_Transform.Y_to_outputs(self, Y, gold, inputs, X):
            outputs.append([([edge[0] for edge in token], [edge[1] for edge in token]) for token in sent])
        return outputs


class CoNLL_SDP_Transform(CoNLLTransform):

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32, min_freq=2,
                 use_pos=True, **kwargs) -> None:
        """Create the semantic dependency parsing transform.

        All arguments are forwarded positionally to the parent constructor. The orphan relation, used for
        tokens without any predicted head, starts out as ROOT.

        Args:
          config(SerializableDict): Forwarded to the parent constructor.
          map_x: Forwarded to the parent constructor.
          map_y: Forwarded to the parent constructor.
          lower: Forwarded to the parent constructor.
          n_buckets: Forwarded to the parent constructor.
          min_freq: Forwarded to the parent constructor.
          use_pos: Forwarded to the parent constructor.
          **kwargs: Forwarded to the parent constructor.

        """
        super().__init__(config, map_x, map_y, lower, n_buckets, min_freq, use_pos, **kwargs)
        self.orphan_relation = ROOT

    def lock_vocabs(self):
        """Lock the vocabularies, then pick the orphan relation from the relation vocabulary."""
        super().lock_vocabs()
        # heuristic to find the orphan relation
        self._find_orphan_relation()

    def _find_orphan_relation(self):
        """Set ``orphan_relation`` to the first relation whose name contains ``root`` (case-insensitive).

        ``orphan_relation`` is left untouched when no relation matches.
        """
        candidates = (rel for rel in self.rel_vocab.idx_to_token if 'root' in rel.lower())
        first_match = next(candidates, None)
        if first_match is not None:
            self.orphan_relation = first_match

    def file_to_inputs(self, filepath: str, gold=True):
        """Read sentences whose tokens may have several heads.

        For ``.conllu`` files the heads and relations come from column 8, split on ``|`` and ``:`` and
        keeping only entries whose head is made of digits. The head/relation of columns 6/7 is removed from
        that list when ``enhanced_only`` is set in the config and added to it (if missing) otherwise. For
        any other file, consecutive rows sharing the same ID are merged into one token that collects the
        head and relation of each row.

        Args:
          filepath(str): Path of the file to read.
          gold: Must be truthy; only gold files are supported.

        Yields:
          One sentence at a time as a list of ``[form, cpos, heads, rels]`` cells, or
          ``[form, heads, rels]`` cells when POS tags are not used.

        """
        assert gold, 'only support gold file for now'
        use_pos = self.use_pos
        is_conllu = filepath.endswith('.conllu')
        enhanced_only = self.config.get('enhanced_only', None)
        for sent in read_conll(filepath):
            parsed_sent = []
            if is_conllu:
                for cell in sent:
                    form, cpos, head, deprel, deps = cell[1], cell[3], cell[6], cell[7], cell[8]
                    heads, rels = [], []
                    for dep in deps.split('|'):
                        pair = dep.split(':', 1)
                        if pair[0].isdigit():
                            heads.append(int(pair[0]))
                            rels.append(pair[1])
                    if enhanced_only:
                        if head in heads:
                            offset = heads.index(head)
                            del heads[offset]
                            del rels[offset]
                    elif head not in heads:
                        heads.append(head)
                        rels.append(deprel)
                    parsed_sent.append([form, cpos, heads, rels] if use_pos else [form, heads, rels])
            else:
                previous = None  # (ID, form, cpos) of the row read last
                heads, rels = [], []
                for cell in sent:
                    ID, form, cpos, head, deprel = cell[0], cell[1], cell[3], cell[6], cell[7]
                    if previous and ID != previous[0]:  # found end of token
                        if use_pos:
                            parsed_sent.append([previous[1], previous[2], heads, rels])
                        else:
                            parsed_sent.append([previous[1], heads, rels])
                        heads, rels = [], []
                    heads.append(head)
                    rels.append(deprel)
                    previous = (ID, form, cpos)
                # flush the last token of the sentence
                if use_pos:
                    parsed_sent.append([previous[1], previous[2], heads, rels])
                else:
                    parsed_sent.append([previous[1], heads, rels])
            yield parsed_sent

    def fit(self, trn_path: str, **kwargs) -> int:
        """Build the form, POS (optional) and relation vocabularies from a training file.

        Cells may have four fields (form, cpos, heads, relations) or three fields, in which case the first
        field is a (form, cpos) pair when POS tags are used and the bare form otherwise.

        Args:
          trn_path(str): Path of the training file, read through ``file_to_samples`` with ``gold=True``.
          **kwargs: Not used by this implementation.

        Returns:
          The number of samples found in the training file.

        Raises:
          ValueError: If a cell has neither three nor four fields.

        """
        self.form_vocab = VocabTF()
        self.form_vocab.add(ROOT)  # make root the 2nd element while 0th is pad, 1st is unk
        if self.use_pos:
            self.cpos_vocab = VocabTF(pad_token=None, unk_token=None)
        self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
        num_samples = 0
        form_counts = Counter()
        for num_samples, sent in enumerate(self.file_to_samples(trn_path, gold=True), start=1):
            for position, cell in enumerate(sent):
                n_fields = len(cell)
                if n_fields == 4:
                    form, cpos, _, deprel = cell
                elif n_fields == 3:
                    first, _, deprel = cell
                    if self.use_pos:
                        form, cpos = first
                    else:
                        form = first
                else:
                    raise ValueError('Unknown data arrangement')
                # position 0 holds the artificial root row, whose form is not counted
                if position:
                    form_counts[form] += 1
                if self.use_pos:
                    self.cpos_vocab.add(cpos)
                self.rel_vocab.update(deprel)

        # only forms that are frequent enough enter the vocabulary
        for token, freq in form_counts.items():
            if freq >= self.config.min_freq:
                self.form_vocab.add(token)
        return num_samples

    def inputs_to_samples(self, inputs, gold=False):
        """Normalize raw sentences into samples and prepend an artificial root row.

        Each cell is converted to a fresh list, so the sentences supplied by the caller are never modified
        (previously list cells were lower-cased and extended in place, which corrupted the caller's data and
        made the cells grow on every repeated call).

        Args:
          inputs: An iterable of sentences; each sentence is a sequence of cells, where a cell is a ``str``
            (the form only), a ``tuple`` or a ``list``.
          gold: When falsy, a placeholder head list ``[0]`` and a one-element relation list holding the
            relation pad token are appended to every cell.

        Yields:
          One sample (a list of cells) per sentence, with the root row at position 0. The root row reuses
          the relation list of the first token.

        """
        use_pos = self.use_pos
        for sent in inputs:
            sample = []
            for cell in sent:
                if isinstance(cell, str):
                    cell = [cell]
                elif isinstance(cell, (tuple, list)):
                    # always copy: the edits below must not leak into the caller's sentence
                    cell = list(cell)
                if self.config['lower']:
                    cell[0] = cell[0].lower()
                if not gold:
                    cell += [[0], [self.rel_vocab.safe_pad_token]]
                sample.append(cell)
            # insert root word with arbitrary fields, anyway it will be masked
            if use_pos:
                form, cpos, head, deprel = sample[0]
                sample.insert(0, [self.bos, self.bos, [0], deprel])
            else:
                form, head, deprel = sample[0]
                sample.insert(0, [self.bos, [0], deprel])
            yield sample

    def batched_inputs_to_batches(self, corpus, indices, shuffle):
        """Turn the samples selected by ``indices`` into one batch with square arc and relation matrices.

        Args:
          corpus(list): All samples; each sample is a list of per-token cells whose last two fields are the
            list of heads and the list of relations of the token.
          indices(list): Positions in ``corpus`` of the samples that make up this batch.
          shuffle: Not used by this implementation.

        Yields:
          One ``((forms, cpos), (arcs, rels))`` tuple. ``forms`` and ``cpos`` are ``int64`` arrays padded at
          the end of each sentence; ``arcs`` and ``rels`` are lists holding one
          ``(max_len, max_len)`` matrix per sample.

        """
        # NOTE(review): the ``use_pos=False`` path looks unfinished - ``raw_batch[:2]`` below also touches
        # the arc list when there are only three columns, and the final assert requires four columns.
        # Confirm that this transform is only ever used with POS tags.
        use_pos = self.use_pos
        raw_batch = [[], [], [], []] if use_pos else [[], [], []]
        # every matrix of the batch is sized after the longest sample
        max_len = len(max([corpus[i] for i in indices], key=len))
        for idx in indices:
            arc = np.zeros((max_len, max_len), dtype=bool)
            rel = np.zeros((max_len, max_len), dtype=np.int64)
            for b in raw_batch[:2]:
                b.append([])
            for m, cells in enumerate(corpus[idx]):
                if use_pos:
                    for b, c, v in zip(raw_batch, cells,
                                       [self.form_vocab, self.cpos_vocab]):
                        b[-1].append(v.get_idx_without_add(c))
                else:
                    for b, c, v in zip(raw_batch, cells,
                                       [self.form_vocab]):
                        b[-1].append(v.get_idx_without_add(c))
                # row m is the token, column n one of its heads
                for n, r in zip(cells[-2], cells[-1]):
                    arc[m, n] = True
                    rid = self.rel_vocab.get_idx_without_add(r)
                    if rid is None:
                        # the arc stays set, only its relation id is left at 0
                        logger.warning(f'Relation OOV: {r} not exists in train')
                        continue
                    rel[m, n] = rid
            raw_batch[-2].append(arc)
            raw_batch[-1].append(rel)
        batch = []
        # only the first two columns (forms and POS tags) need padding
        for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab]):
            b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post',
                                                              value=v.safe_pad_token_idx,
                                                              dtype='int64')
            batch.append(b)
        batch += raw_batch[2:]
        assert len(batch) == 4
        yield (batch[0], batch[1]), (batch[2], batch[3])

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Describe the dataset structure as ``(types, shapes, padding values)``.

        Each of the three results has the layout ``((forms, cpos), (arcs, rels))``; the arcs are boolean and
        both labels have rank 3.

        Returns:
          The dtypes, the fully dynamic shapes and the padding values of a batch.

        """
        x_types = (tf.int64, tf.int64)
        y_types = (tf.bool, tf.int64)
        x_shapes = ([None, None], [None, None])
        y_shapes = ([None, None, None], [None, None, None])
        x_values = (self.form_vocab.safe_pad_token_idx, self.cpos_vocab.safe_pad_token_idx)
        # arcs are padded with False, relations with the relation vocabulary's pad index
        y_values = (False, self.rel_vocab.safe_pad_token_idx)
        return (x_types, y_types), (x_shapes, y_shapes), (x_values, y_values)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
        """Decode arc and relation matrices into lists of ``(head, relation)`` pairs.

        The first row and the first column of every matrix are skipped. A token without any arc receives
        the single pair ``(0, self.orphan_relation)``.

        Args:
          Y: An ``(arc_preds, rel_preds, mask)`` triple. The per-sentence lengths derived from ``mask`` are
            computed but not used to trim the output.
          gold: Not used by this implementation.
          inputs: Not used by this implementation.
          X: Not used by this implementation.

        Returns:
          For every sentence a list with one entry per matrix row, each entry being the list of
          ``(head, relation name)`` pairs of that row.

        """
        arc_preds, rel_preds, mask = Y
        lengths = tf.math.count_nonzero(mask, axis=-1)
        idx_to_rel = self.rel_vocab.idx_to_token
        sents = []

        for arc_sent, rel_sent, _ in zip(arc_preds, rel_preds, lengths):
            arc_rows = tolist(arc_sent[1:, 1:])
            rel_rows = tolist(rel_sent[1:, 1:])
            sent = []
            for arc_row, rel_row in zip(arc_rows, rel_rows):
                # column 0 was dropped above, hence the +1 to get back to the head index
                edges = [(col + 1, idx_to_rel[rel_id])
                         for col, (has_arc, rel_id) in enumerate(zip(arc_row, rel_row)) if has_arc]
                # orphan
                sent.append(edges or [(0, self.orphan_relation)])
            sents.append(sent)

        return sents

    def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                             gold=False, inputs=None, conll=True) -> Iterable:
        """Pair every input token with its predicted heads and relations.

        Args:
          X: A ``(words, feats, mask)`` triple; only ``mask`` is read.
          Y: An ``(arc_preds, rel_preds)`` pair.
          gold: Not used by this implementation.
          inputs: The original sentences, each token being a ``(form, cpos)`` pair.
          conll: Truthy to emit ``CoNLLWord`` rows, falsy to emit plain ``[heads, deprels]`` pairs.

        Returns:
          A list with one ``CoNLLSentence`` per input sentence.

        """
        (_, _, mask), (arc_preds, rel_preds) = X, Y
        predictions = self.Y_to_outputs((arc_preds, rel_preds, mask))
        sents = []
        for tokens, parsed in zip(inputs, predictions):
            sent = CoNLLSentence()
            for token_id, ((form, cpos), edges) in enumerate(zip(tokens, parsed), start=1):
                heads = [edge[0] for edge in edges]
                deprels = [edge[1] for edge in edges]
                if conll:
                    row = CoNLLWord(id=token_id, form=form, cpos=cpos, head=heads, deprel=deprels)
                else:
                    row = [heads, deprels]
                sent.append(row)
            sents.append(sent)
        return sents


================================================
FILE: hanlp/transform/glue_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 16:34
from hanlp_common.structure import SerializableDict
from hanlp.datasets.glu.glue import STANFORD_SENTIMENT_TREEBANK_2_TRAIN, MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_DEV
from hanlp.transform.table_tf import TableTransform


class StanfordSentimentTreebank2Transorm(TableTransform):
    """Table transform for SST-2; it adds nothing on top of the ``TableTransform`` defaults."""


class MicrosoftResearchParaphraseCorpus(TableTransform):
    """Table transform for MRPC: the sentence pair lives in columns 3 and 4, the label in column 0."""

    def __init__(self, config: SerializableDict = None, map_x=False, map_y=True, x_columns=(3, 4),
                 y_column=0, skip_header=True, delimiter='auto', **kwargs) -> None:
        """Create the transform.

        Args:
            config: Optional configuration forwarded to ``TableTransform``.
            map_x: Forwarded to ``TableTransform``.
            map_y: Forwarded to ``TableTransform``.
            x_columns: Indices of the columns holding the two sentences.
            y_column: Index of the label column.
            skip_header: Whether the first row of the file is a header to be skipped.
            delimiter: Column delimiter, forwarded to ``TableTransform``.
            **kwargs: Extra settings forwarded to ``TableTransform``.
        """
        # Pass by keyword: ``TableTransform.__init__`` declares ``multi_label`` between ``y_column`` and
        # ``skip_header``, so positional passing would bind ``skip_header`` to ``multi_label`` and
        # ``delimiter`` to ``skip_header``.
        super().__init__(config, map_x, map_y, x_columns=x_columns, y_column=y_column,
                         skip_header=skip_header, delimiter=delimiter, **kwargs)


def main():
    """Run the MRPC smoke test; the SST-2 one is currently switched off."""
    # _test_sst2()
    _test_mrpc()


def _test_sst2():
    """Fit the SST-2 transform on the training set and print the first batch of the resulting dataset."""
    sst2 = StanfordSentimentTreebank2Transorm()
    sst2.fit(STANFORD_SENTIMENT_TREEBANK_2_TRAIN)
    sst2.lock_vocabs()
    sst2.label_vocab.summary()
    sst2.build_config()
    batches = sst2.file_to_dataset(STANFORD_SENTIMENT_TREEBANK_2_TRAIN)
    for first_batch in batches.take(1):
        print(first_batch)


def _test_mrpc():
    """Fit the MRPC transform on the dev set and print the first batch of the resulting dataset."""
    mrpc = MicrosoftResearchParaphraseCorpus()
    mrpc.fit(MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_DEV)
    mrpc.lock_vocabs()
    mrpc.label_vocab.summary()
    mrpc.build_config()
    batches = mrpc.file_to_dataset(MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_DEV)
    for first_batch in batches.take(1):
        print(first_batch)

================================================
FILE: hanlp/transform/table_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-10 21:00
from abc import ABC
from typing import Tuple, Union
import numpy as np
import tensorflow as tf

from hanlp_common.structure import SerializableDict
from hanlp.common.transform_tf import Transform
from hanlp_common.constant import PAD
from hanlp.common.vocab_tf import create_label_vocab
from hanlp.utils.io_util import read_cells
from hanlp.utils.log_util import logger


class TableTransform(Transform, ABC):
    """Transform for delimited table files where some columns are features and one column is the label."""

    def __init__(self, config: SerializableDict = None, map_x=False, map_y=True, x_columns=None,
                 y_column=-1, multi_label=False,
                 skip_header=True, delimiter='auto', **kwargs) -> None:
        """Create the transform and an empty label vocabulary.

        Args:
            config: Optional configuration forwarded to ``Transform``.
            map_x: Forwarded to ``Transform``.
            map_y: Forwarded to ``Transform``.
            x_columns: Indices of the feature columns. When falsy, every column except the label is a feature.
            y_column: Index of the label column.
            multi_label: ``True`` if the label cell holds several labels, either as a Python list literal
                (``['a', 'b']``) or as a comma separated string (``a,b``).
            skip_header: Whether the first row of the file is a header to be skipped.
            delimiter: Column delimiter passed to ``read_cells``.
            **kwargs: Extra settings forwarded to ``Transform``.
        """
        super().__init__(config, map_x, map_y, x_columns=x_columns, y_column=y_column, multi_label=multi_label,
                         skip_header=skip_header,
                         delimiter=delimiter, **kwargs)
        self.label_vocab = create_label_vocab()

    def file_to_inputs(self, filepath: str, gold=True):
        """Read a table file and yield ``(features, label)`` pairs.

        Args:
            filepath: Path of the table file.
            gold: Not used by this method.

        Yields:
            A tuple ``(features, label)`` where ``features`` is a tuple of cells and ``label`` is a string,
            or a list of strings when ``multi_label`` is enabled.
        """
        import ast  # function-scope import: only needed to parse list-literal labels

        x_columns = self.config.x_columns
        y_column = self.config.y_column
        num_features = self.config.get('num_features', None)
        multi_label = self.config.get('multi_label', None)
        for cells in read_cells(filepath, skip_header=self.config.skip_header, delimiter=self.config.delimiter):
            # Typical layout: at most two feature columns (a sentence pair), most often a single column with
            # the document content, plus one label column.
            if x_columns:
                inputs = tuple(c for i, c in enumerate(cells) if i in x_columns), cells[y_column]
            else:
                if y_column != -1:
                    # Move the label to the last position so that everything before it is a feature
                    cells[-1], cells[y_column] = cells[y_column], cells[-1]
                inputs = tuple(cells[:-1]), cells[-1]
            if num_features is None:
                # The first row decides how many features every following row must have
                num_features = len(inputs[0])
                self.config.num_features = num_features
            if multi_label:
                raw_labels = inputs[1]
                assert isinstance(raw_labels, str), 'Y value has to be string'
                if raw_labels.startswith('['):
                    # The labels are written as a Python list literal. ``ast.literal_eval`` only accepts
                    # literals, so a crafted cell cannot execute arbitrary code the way ``eval`` would.
                    labels = ast.literal_eval(raw_labels)
                else:
                    labels = raw_labels.strip().split(',')
                inputs = inputs[0], labels
            else:
                assert num_features == len(inputs[0]), f'Numbers of columns {num_features} ' \
                                                       f'inconsistent with current {len(inputs[0])}'
            yield inputs

    def inputs_to_samples(self, inputs, gold=False):
        """Turn inputs into samples, attaching the padding label when no gold label is present.

        Args:
            inputs: An iterable of inputs; each already carries its label when ``gold`` is ``True``.
            gold: Whether the inputs come with labels.

        Yields:
            The input itself if ``gold``, otherwise ``(input, pad_label)``.
        """
        pad = self.label_vocab.safe_pad_token
        for cells in inputs:
            if gold:
                yield cells
            else:
                yield cells, pad

    def y_to_idx(self, y) -> tf.Tensor:
        """Look labels up in the label vocabulary."""
        return self.label_vocab.lookup(y)

    def fit(self, trn_path: str, **kwargs):
        """Collect every label of the training file into the label vocabulary.

        Args:
            trn_path: Path of the training file.
            **kwargs: Not used by this method.

        Returns:
            The number of samples read.
        """
        multi_label = self.config.get('multi_label', None)
        samples = 0
        for sample in self.file_to_samples(trn_path, gold=True):
            label = sample[1]  # the second element regardless of the sample being a pair or a triple
            if multi_label:
                for each in label:
                    self.label_vocab.add(each)
            else:
                self.label_vocab.add(label)
            samples += 1
        return samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Describe the dataset: string scalars for every feature and for the label.

        Returns:
            A triple ``(types, shapes, padding_values)``.
        """
        num_features = self.config.num_features
        # It's crucial to use tuple instead of list for all the three
        types = tuple([tf.string] * num_features), tf.string
        shapes = tuple([[]] * num_features), []
        values = tuple([PAD] * num_features), self.label_vocab.safe_pad_token
        return types, shapes, values

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        """Return ``x`` unchanged after logging a warning; subclasses are expected to override this."""
        logger.warning('TableTransform can not map x to idx. Please override x_to_idx')
        return x


================================================
FILE: hanlp/transform/tacred_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-14 17:06
from typing import Union, Tuple

import tensorflow as tf

from hanlp_common.structure import SerializableDict
from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp_common.io import load_json
from hanlp_common.util import merge_locals_kwargs


def get_positions(start_idx, end_idx, length):
    """Get the position of every token relative to a subj/obj span.

    Args:
      start_idx: Index of the first token of the span.
      end_idx: Index of the last token of the span (inclusive).
      length: Number of tokens in the sequence.

    Returns:
      A list where tokens before the span get negative offsets to ``start_idx``, tokens inside the span get
      ``0`` and tokens after the span get positive offsets to ``end_idx``.

    """
    before_span = [i - start_idx for i in range(start_idx)]
    inside_span = [0] * (end_idx - start_idx + 1)
    after_span = [i - end_idx for i in range(end_idx + 1, length)]
    return before_span + inside_span + after_span


class TACREDTransform(Transform):
    """Transform for TACRED style relation extraction data stored as a JSON list of annotated sentences."""

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=False, **kwargs) -> None:
        # All arguments are forwarded to ``Transform`` through ``merge_locals_kwargs``. No local variable may
        # be introduced before this call, because ``locals()`` is captured as a whole.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.token_vocab = VocabTF()
        self.pos_vocab = VocabTF(pad_token=None, unk_token=None)
        self.ner_vocab = VocabTF(pad_token=None)
        self.deprel_vocab = VocabTF(pad_token=None, unk_token=None)
        self.rel_vocab = VocabTF(pad_token=None, unk_token=None)

    def fit(self, trn_path: str, **kwargs) -> int:
        """Fill the token, pos, ner, deprel and relation vocabularies from the training file.

        Args:
            trn_path: Path of the training file.
            **kwargs: Not used by this method.

        Returns:
            The number of samples read.
        """
        count = 0
        for (tokens, pos, ner, _head, deprel, _subj_positions, _obj_positions, _subj_type,
             _obj_type), relation in self.file_to_samples(trn_path, gold=True):
            count += 1
            self.token_vocab.update(tokens)
            self.pos_vocab.update(pos)
            self.ner_vocab.update(ner)
            self.deprel_vocab.update(deprel)
            self.rel_vocab.add(relation)
        return count

    def file_to_inputs(self, filepath: str, gold=True):
        """Read a JSON file and yield one ``(features, relation)`` pair per record.

        Args:
            filepath: Path of the JSON file.
            gold: Not used by this method.

        Yields:
            ``(tokens, pos, ner, head, deprel, subj_start, subj_end, obj_start, obj_end), relation``.
        """
        for record in load_json(filepath):
            tokens = list(record['token'])
            subj_start, subj_end = record['subj_start'], record['subj_end']
            obj_start, obj_end = record['obj_start'], record['obj_end']
            pos = record['stanford_pos']
            ner = record['stanford_ner']
            deprel = record['stanford_deprel']
            head = [int(x) for x in record['stanford_head']]
            # Every sentence needs at least one token attached to the root (head 0)
            assert any(x == 0 for x in head)
            relation = record['relation']
            yield (tokens, pos, ner, head, deprel, subj_start, subj_end, obj_start, obj_end), relation

    def inputs_to_samples(self, inputs, gold=False):
        """Convert inputs into samples with entity positions, entity types and anonymized tokens.

        Args:
            inputs: An iterable of inputs as produced by :meth:`file_to_inputs`; without the relation when
                ``gold`` is ``False``.
            gold: Whether each input carries its relation.

        Yields:
            ``(tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type), relation``.
        """
        for sample in inputs:
            if gold:
                (tokens, pos, ner, head, deprel, subj_start, subj_end, obj_start, obj_end), relation = sample
            else:
                tokens, pos, ner, head, deprel, subj_start, subj_end, obj_start, obj_end = sample
                relation = self.rel_vocab.safe_pad_token
            # Copy before anonymizing so that the caller's token list is not overwritten
            tokens = list(tokens)
            length = len(tokens)
            subj_positions = get_positions(subj_start, subj_end, length)
            obj_positions = get_positions(obj_start, obj_end, length)
            subj_type = ner[subj_start]
            obj_type = ner[obj_start]
            # anonymize tokens
            tokens[subj_start:subj_end + 1] = ['SUBJ-' + subj_type] * (subj_end - subj_start + 1)
            tokens[obj_start:obj_end + 1] = ['OBJ-' + obj_type] * (obj_end - obj_start + 1)
            # min head is 0, but root is not included in tokens, so take 1 off from each head
            head = [h - 1 for h in head]
            yield (tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type), relation

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Describe the dataset produced by :meth:`inputs_to_samples`.

        Returns:
            A triple ``(types, shapes, padding_values)``.
        """
        # (tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type), relation
        types = (tf.string, tf.string, tf.string, tf.int32, tf.string, tf.int32, tf.int32, tf.string,
                 tf.string), tf.string
        shapes = ([None], [None], [None], [None], [None], [None], [None], [], []), []
        pads = (self.token_vocab.safe_pad_token, self.pos_vocab.safe_pad_token, self.ner_vocab.safe_pad_token, 0,
                self.deprel_vocab.safe_pad_token,
                0, 0, self.ner_vocab.safe_pad_token, self.ner_vocab.safe_pad_token), self.rel_vocab.safe_pad_token
        return types, shapes, pads

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        """Map the string features to indices; heads and positions are passed through unchanged."""
        tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type = x
        tokens = self.token_vocab.lookup(tokens)
        pos = self.pos_vocab.lookup(pos)
        ner = self.ner_vocab.lookup(ner)
        deprel = self.deprel_vocab.lookup(deprel)
        # Entity types share the vocabulary of the NER tags
        subj_type = self.ner_vocab.lookup(subj_type)
        obj_type = self.ner_vocab.lookup(obj_type)
        return tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type

    def y_to_idx(self, y) -> tf.Tensor:
        """Look relations up in the relation vocabulary."""
        return self.rel_vocab.lookup(y)


================================================
FILE: hanlp/transform/text_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-04 11:46
from typing import Union, Tuple, Iterable, Any

import tensorflow as tf

from hanlp_common.structure import SerializableDict
from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp.metrics.chunking.sequence_labeling import get_entities
from hanlp.utils.file_read_backwards import FileReadBackwards
from hanlp.utils.io_util import read_tsv_as_sents


class TextTransform(Transform):
    """Transform which cuts a text file into fixed-length token windows paired with their successors."""

    def __init__(self,
                 forward=True,
                 seq_len=10,
                 tokenizer='char',
                 config: SerializableDict = None, map_x=True, map_y=True, **kwargs) -> None:
        """Create the transform.

        Args:
            forward: ``True`` to read the file from the start, ``False`` to read it backwards.
            seq_len: Number of tokens per window.
            tokenizer: ``'char'``, ``'whitespace'`` or a separator string to split each line on.
            config: Optional configuration forwarded to ``Transform``.
            map_x: Forwarded to ``Transform``.
            map_y: Forwarded to ``Transform``.
            **kwargs: Extra settings forwarded to ``Transform``.
        """
        super().__init__(config, map_x, map_y, seq_len=seq_len, tokenizer=tokenizer, forward=forward, **kwargs)
        self.vocab: VocabTF = None

    def tokenize_func(self):
        """Return the callable that splits a line into tokens according to ``config.tokenizer``."""
        kind = self.config.tokenizer
        if kind == 'char':
            return list
        if kind == 'whitespace':
            return lambda x: x.split()
        # Any other value is used as the separator
        return lambda x: x.split(self.config.tokenizer)

    def fit(self, trn_path: str, **kwargs) -> int:
        """Build a fresh vocabulary from the training file and return the number of windows read."""
        self.vocab = VocabTF()
        num_samples = 0
        for window, _ in self.file_to_inputs(trn_path):
            self.vocab.update(window)
            num_samples += 1
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Describe the dataset: two variable-length string sequences padded with the vocab's pad token."""
        pad = self.vocab.pad_token
        return (tf.string, tf.string), ([None], [None]), (pad, pad)

    def file_to_inputs(self, filepath: str, gold=True):
        """Slide a window over the tokens of a file.

        Args:
            filepath: Path of the text file.
            gold: Not used by this method.

        Yields:
            ``(window, shifted)`` where ``shifted`` is the window moved one token ahead.
        """
        forward = self.config.forward
        seq_len = self.config.seq_len
        pending = []
        tokenizer = self.tokenize_func()
        if forward:
            reader = open(filepath, encoding='utf-8')
        else:
            reader = FileReadBackwards(filepath, encoding="utf-8")
        with reader as src:
            for line in src:
                pending += tokenizer(line)
                # Emit windows as long as one more token is available to serve as the last target
                while len(pending) > seq_len:
                    yield pending[:seq_len], pending[1:1 + seq_len]
                    del pending[0]

    def inputs_to_samples(self, inputs, gold=False):
        """Yield ``(x, y)`` pairs, reversing both when the transform runs backwards.

        Args:
            inputs: An iterable of ``(x, y)`` pairs if ``gold``, otherwise of single sequences.
            gold: Whether each input already carries its target.
        """
        forward = self.config.forward
        for each in inputs:
            x, y = each if gold else (each, each)
            if not forward:
                x, y = list(reversed(x)), list(reversed(y))
            yield x, y

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        """Look tokens up in the vocabulary."""
        return self.vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        """Targets share the vocabulary of the inputs."""
        return self.x_to_idx(y)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, **kwargs) -> Iterable:
        """Yield, for every input, the tokens with the highest score along the last axis of ``Y``."""
        best = tf.argmax(Y, axis=-1)
        # ``inputs`` only bounds the number of yielded sequences
        for token_ids, _ in zip(best, inputs):
            yield [self.vocab.idx_to_token[int(token_id)] for token_id in token_ids]

    def input_is_single_sample(self, input: Any) -> bool:
        """A single sample is a sequence whose first element is a string."""
        first = input[0]
        return isinstance(first, str)


def bmes_to_flat(inpath, outpath):
    """Convert a character-level tagged tsv file into one ``word/tag`` sequence per line.

    Args:
        inpath: Path of the tsv file, whose first column is the character and second column is its tag.
        outpath: Path of the text file to write.
    """
    with open(outpath, 'w', encoding='utf-8') as out:
        for sent in read_tsv_as_sents(inpath):
            chunks = get_entities([cells[1] for cells in sent])
            chars = [cells[0] for cells in sent]
            pieces = []
            for tag, start, end in chunks:
                word = ''.join(chars[start: end])
                pieces.append(f'{word}/{tag}')
            out.write(' '.join(pieces))
            out.write('\n')

================================================
FILE: hanlp/transform/transformer_tokenizer.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-03 16:23
import warnings
from typing import Union, Optional

from hanlp_common.constant import BOS, EOS
from hanlp_common.structure import SerializableDict
from hanlp.layers.transformers.pt_imports import PreTrainedTokenizer, PretrainedConfig, AutoTokenizer_
from hanlp_trie import DictInterface


class TransformerTokenizer(object):
    """Base class of transformer tokenizers holding the length limit and the sliding window routine."""

    def __init__(self, max_seq_length=512, truncate_long_sequences=True) -> None:
        """Store the length settings.

        Args:
            max_seq_length: Maximum number of wordpieces per window.
            truncate_long_sequences: Stored for subclasses to decide between truncation and sliding window.
        """
        self.truncate_long_sequences = truncate_long_sequences
        self.max_seq_length = max_seq_length

    def sliding_window(self, flat_wordpiece_ids, same_tail=True):
        """Split a long id sequence into overlapping windows and concatenate them.

        Every window starts with the first id of the input and, if ``same_tail``, ends with the last id of
        the input. Consecutive windows overlap by half of the window length.

        Args:
            flat_wordpiece_ids: A list of ids whose first element is the leading special id.
            same_tail: ``True`` if the last element is a trailing special id to be repeated in each window.

        Returns:
            A flat list of the ids of all windows. Inputs that fit in one window are returned as they are.

        Raises:
            ValueError: If ``max_seq_length`` leaves fewer than 2 positions for content.
        """
        head = flat_wordpiece_ids[:1]
        if same_tail:
            body, tail = flat_wordpiece_ids[1:-1], flat_wordpiece_ids[-1:]
        else:
            body, tail = flat_wordpiece_ids[1:], []
        window_length = self.max_seq_length - len(head) - len(tail)
        stride = window_length // 2
        if stride < 1:
            raise ValueError(f'max_seq_length={self.max_seq_length} is too small for a sliding window')
        windows = [head + body[i:i + window_length] + tail
                   for i in range(0, len(body), stride)]
        if not windows:
            # Nothing but special ids (or nothing at all): there is no content to split
            return list(flat_wordpiece_ids)

        # Check for overlap in the last window. Throw it away if it is redundant.
        # The comparison needs two windows; a single window cannot be redundant.
        if len(windows) > 1:
            last_window = windows[-1][1:]
            if last_window == windows[-2][-len(last_window):]:
                windows.pop()

        return [wordpiece for window in windows for wordpiece in window]


class TransformerTextTokenizer(TransformerTokenizer):
    """Tokenizer for raw text (optionally a text pair) which stores the encoding back into the sample."""
    _KEY = ['input_ids', 'attention_mask', 'token_type_ids']

    def __init__(self,
                 tokenizer: Union[PreTrainedTokenizer, str],
                 text_a_key: str,
                 text_b_key: str = None,
                 output_key=None,
                 max_seq_length=512, truncate_long_sequences=True) -> None:
        """Create the tokenizer.

        Args:
            tokenizer: The identifier of a pre-trained tokenizer or a ``PreTrainedTokenizer``.
            text_a_key: The key of the first text in samples.
            text_b_key: The key of the optional second text in samples.
            output_key: Prefix of the keys to store results under. ``None`` derives it from the text keys,
                an empty string stores the results under the bare names in ``_KEY``.
            max_seq_length: Maximum number of wordpieces.
            truncate_long_sequences: ``True`` to truncate long inputs, ``False`` to apply a sliding window.
        """
        super().__init__(max_seq_length, truncate_long_sequences)
        self.text_b = text_b_key
        self.text_a = text_a_key
        prefix = output_key
        if prefix is None:
            prefix = text_a_key
            if text_b_key:
                prefix = prefix + '_' + text_b_key
        if prefix == '':
            self.output_key = self._KEY
        else:
            self.output_key = [f'{prefix}_{key}' for key in self._KEY]
        self.tokenizer = AutoTokenizer_.from_pretrained(tokenizer) if isinstance(tokenizer, str) else tokenizer

    def __call__(self, sample: dict):
        """Encode the text(s) of ``sample`` and write the results into it.

        Args:
            sample: A dict holding the text under ``text_a_key`` (and ``text_b_key`` if configured).

        Returns:
            The same ``sample`` with ids, attention mask and type ids added under the output keys.
        """
        first = sample[self.text_a]
        second = sample[self.text_b] if self.text_b else None
        limit = self.max_seq_length if self.truncate_long_sequences else None
        encoding = self.tokenizer.encode_plus(first, second, max_length=limit)
        results = {field: encoding.data.get(field, None) for field in self._KEY}
        too_long = len(results['input_ids']) > self.max_seq_length if not self.truncate_long_sequences else False
        if too_long:
            # TODO: other fields should be properly handled too
            results['input_ids'] = self.sliding_window(results['input_ids'])
        if not results['token_type_ids']:
            # Fall back to the type ids of the first underlying encoding
            results['token_type_ids'] = encoding[0].type_ids
        for target, field in zip(self.output_key, self._KEY):
            sample[target] = results[field]
        return sample


class TransformerSequenceTokenizer(TransformerTokenizer):

    def __init__(self,
                 tokenizer: Union[PreTrainedTokenizer, str],
                 input_key,
                 output_key=None,
                 max_seq_length=512,
                 truncate_long_sequences=False,
                 config: PretrainedConfig = None,
                 cls_token_at_end=False,
                 cls_token_segment_id=0,
                 pad_token_segment_id=0,
                 pad_on_left=False,
                 do_padding=False,
                 sep_token_extra=False,
                 ret_mask_and_type=False,
                 ret_prefix_mask=False,
                 ret_token_span=True,
                 ret_subtokens=False,
                 ret_subtokens_group=False,
                 cls_is_bos=False,
                 sep_is_eos=False,
                 do_basic_tokenize=True,
                 use_fast=True,
                 dict_force=None,
                 strip_cls_sep=True,
                 check_space_before=None,
                 ) -> None:
        """A transformer tokenizer for token-level tasks. It honors the boundary of tokens and tokenize each token into
        several subtokens then merge them. The information about each subtoken belongs to which token are kept and
        returned as a new field in the sample. It also provides out-of-box sliding window trick on long sequences.

        Args:
            tokenizer: The identifier of a pre-trained tokenizer or a ``PreTrainedTokenizer``.
            input_key: The token key in samples.
            output_key: The output keys to store results. When it is ``None`` or a string, the keys are generated
                by joining a prefix (``input_key`` for ``None``, no prefix for ``''``, otherwise the string itself)
                with ``input_ids`` plus the suffixes enabled by ``ret_mask_and_type``, ``ret_prefix_mask`` and
                ``ret_token_span``. Any other non-empty value is stored as it is.
            max_seq_length: Sentences longer than ``max_seq_length`` will be split into shorter ones if possible.
            truncate_long_sequences: ``True`` to truncate exceeded parts of long sequences. ``False`` to  enable
                sliding window.
            config: The ``PretrainedConfig`` to determine the model structure of the transformer, so that special
                tokenization can be applied. When given, it overrides ``pad_token_segment_id``,
                ``cls_token_segment_id``, ``cls_token_at_end`` and ``pad_on_left`` depending on whether the model
                is an xlnet.
            cls_token_at_end: ``True`` to put ``[CLS]`` at the end of input tokens.
            cls_token_segment_id: The segment id of ``[CLS]``.
            pad_token_segment_id: The segment id of ``[PAD]``.
            pad_on_left: ``True`` to put ``[PAD]`` at the left side of input tokens.
            do_padding: ``True`` to pad sequences. Presumably the side is decided by ``pad_on_left``; this is
                not visible in the constructor.
            sep_token_extra: ``True`` to have two ``[SEP]``.
            ret_mask_and_type: ``True`` to return masks and type ids.
            ret_prefix_mask: ``True`` to generate a mask where each non-zero element corresponds to a prefix of a token.
            ret_token_span: ``True`` to return span of each token measured by subtoken offsets.
            ret_subtokens: ``True`` to return list of subtokens belonging to each token for tokenization purpose.
                When enabled, the prefix mask for each subtoken is set to True as each subtoken is a token unit in
                tokenization task. Similarly, the token span for each token will be a continuous integer sequence.
            ret_subtokens_group: ``True`` to return list of offsets of subtokens belonging to each token.
            cls_is_bos: ``True`` means the first token of input is treated as [CLS] no matter what its surface form is.
                        ``False`` (default) means the first token is not [CLS], it will have its own embedding other than
                        the embedding of [CLS].
            sep_is_eos: ``True`` means the last token of input is [SEP].
                        ``False`` means it's not but [SEP] will be appended,
                        ``None`` means it dependents on `input[-1] == [EOS]`.
            do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            use_fast: Whether or not to try to load the fast version of the tokenizer.
            dict_force: A dictionary doing longest-prefix-match on input text so that the head and tail of each keyword
                won't be concatenated to other tokens by transformer tokenizers.
            strip_cls_sep: ``True`` to strip [CLS] and [SEP] off the input tokens.
            check_space_before: ``True`` to detect the space before each token to handle underline in sentence piece
                tokenization. ``None`` enables it for a fixed list of tokenizer names.

        Examples:

        .. highlight:: python
        .. code-block:: python

            transform = TransformerSequenceTokenizer('bert-base-uncased', 'token')
            sample = {'token': 'HanLP good'.split()}
            print(transform(sample))

        """
        super().__init__(max_seq_length, truncate_long_sequences)
        # Remember the name before ``tokenizer`` is possibly replaced by the loaded object below
        tokenizer_name = tokenizer if isinstance(tokenizer, str) else tokenizer.name_or_path
        if check_space_before is None:
            # These tokenizers are BPE-based: they put a space before each token, tokenizing loving into
            # ['▁lo', 'ving'] and 商品 into ['▁', '商品']. In the latter case, the prefix '▁' has to be removed
            # as some languages like Chinese have no space between words
            check_space_before = tokenizer_name in ('xlm-roberta-base', 'xlm-roberta-large', 'google/mt5-small',
                                                    'google/mt5-base', 'xlm-roberta-base-no-space',
                                                    'mMiniLMv2L6-no-space', 'mMiniLMv2L12-no-space')
        self.check_space_before = check_space_before
        self.ret_subtokens_group = ret_subtokens_group
        self.ret_subtokens = ret_subtokens
        self.sep_is_eos = sep_is_eos
        self.ret_prefix_mask = ret_prefix_mask
        self.ret_mask_and_type = ret_mask_and_type
        self.cls_is_bos = cls_is_bos
        self.ret_token_span = ret_token_span
        # Derive the output keys from the enabled outputs unless the caller passed them explicitly
        if not output_key or isinstance(output_key, str):
            suffixes = ['input_ids']
            if ret_mask_and_type:
                suffixes += 'attention_mask', 'token_type_ids'
            if ret_prefix_mask:
                suffixes += ['prefix_mask']
            if ret_token_span:
                suffixes.append('token_span')
            if output_key is None:
                output_key = [f'{input_key}_{key}' for key in suffixes]
            elif output_key == '':
                output_key = suffixes
            else:
                output_key = [f'{output_key}_{key}' for key in suffixes]

        self.input_key = input_key
        self.output_key = output_key
        if config:
            # A config takes precedence over the corresponding arguments
            xlnet = config_is(config, 'xlnet')
            pad_token_segment_id = 4 if xlnet else 0
            cls_token_segment_id = 2 if xlnet else 0
            cls_token_at_end = xlnet
            pad_on_left = xlnet
        if isinstance(tokenizer, str):
            tokenizer = AutoTokenizer_.from_pretrained(tokenizer, use_fast=use_fast,
                                                       do_basic_tokenize=do_basic_tokenize)
        if use_fast:
            # Dirty fix upstream bug: https://github.com/hankcs/HanLP/issues/1602
            if hasattr(tokenizer, '_tokenizer') and hasattr(tokenizer._tokenizer, 'no_truncation'):
                _t = tokenizer._tokenizer
                # Disable truncation and padding once, then turn both switches into no-ops
                _t.no_truncation()
                _t.no_padding()
                _t.no_truncation = _t.no_padding = lambda: None
        pad_token = tokenizer.pad_token
        self.pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0]
        self.pad_token_segment_id = pad_token_segment_id
        if tokenizer_name in ('google/mt5-small', 'google/mt5-base'):
            # mt5 doesn't have cls or sep, but we can use something similar
            # NOTE(review): ``self.cls_token_segment_id`` is not assigned in this branch, so reading it later
            # on an mt5 tokenizer would raise AttributeError. Confirm whether any code path does so.
            self.has_cls = False
            self.cls_token = '▁'
            self.cls_token_id = tokenizer.convert_tokens_to_ids(self.cls_token)
            self.sep_token = tokenizer.eos_token
            self.sep_token_id = tokenizer.eos_token_id
        else:
            self.has_cls = True
            self.cls_token = tokenizer.cls_token
            self.sep_token = tokenizer.sep_token
            self.cls_token_segment_id = cls_token_segment_id
            self.cls_token_id = tokenizer.cls_token_id
            self.sep_token_id = tokenizer.sep_token_id

        self.sep_token_extra = sep_token_extra
        self.cls_token_at_end = cls_token_at_end
        self.tokenizer = tokenizer
        self.pad_on_left = pad_on_left
        self.do_padding = do_padding
        # Token spans and the sliding window are not supported together with a trailing [CLS] or left padding
        if self.ret_token_span or not self.truncate_long_sequences:
            assert not self.cls_token_at_end
            assert not self.pad_on_left
        # if self.ret_subtokens:
        #     if not use_fast:
        #         raise NotImplementedError(
        #             'ret_subtokens is not available when using Python tokenizers. '
        #             'To use this feature, set use_fast = True.')
        self.dict: Optional[DictInterface] = dict_force  # For tokenization of raw text
        self.strip_cls_sep = strip_cls_sep

    def __call__(self, sample: dict):
        input_tokens = sample[self.input_key]
        input_is_str = isinstance(input_tokens, str)
        tokenizer = self.tokenizer
        ret_token_span = self.ret_token_span
        if input_is_str:  # This happens in a tokenizer component where the raw sentence is fed.

            # noinspection PyShadowingNames
            def tokenize_str(input_str, add_special_tokens=True):
                if tokenizer.is_fast:
                    encoding = tokenizer.encode_plus(input_str,
                                                     return_offsets_mapping=True,
                                                     add_special_tokens=add_special_tokens).encodings[0]
                    subtoken_offsets = encoding.offsets
                    input_tokens = encoding.tokens
                    input_ids = encoding.ids

                    # Fill up missing non-blank characters swallowed by HF tokenizer
                    offset = 0
                    fixed_offsets = []
                    fixed_tokens = []
                    fixed_ids = []
                    for token, id, (b, e) in zip(input_tokens, input_ids, subtoken_offsets):
                        if b > offset:
                            missing_token = input_str[offset: b]
                            if not missing_token.isspace():  # In the future, we may want space back
                                fixed_tokens.append(missing_token)
                                fixed_ids.append(tokenizer.unk_token_id)
                                fixed_offsets.append((offset, b))
                        if e == offset:  # LI™ -> LIT + M
                            if fixed_offsets and fixed_offsets[-1][0] < b:
                                fixed_offsets[-1] = (fixed_offsets[-1][0], b)

                        fixed_tokens.append(token)
                        fixed_ids.append(id)
                        fixed_offsets.append((b, e))
                        offset = e
                    subtoken_offsets = fixed_offsets
                    input_tokens = fixed_tokens
                    input_ids = fixed_ids

                    if add_special_tokens:
                        subtoken_offsets = subtoken_offsets[1 if self.has_cls else 0:-1]

                    # Edge case that the input_str is swallowed in whole
                    if input_str and not subtoken_offsets and not input_str.isspace():
                        __index = 1 if add_special_tokens and self.has_cls else 0
                        input_tokens.insert(__index, input_str)
                        input_ids.insert(__index, tokenizer.unk_token_id)
                        subtoken_offsets.append((0, len(input_str)))

                    if not self.has_cls:
                        input_tokens = [self.cls_token] + input_tokens
                        input_ids = [self.cls_token_id] + input_ids
                else:
                    input_tokens = tokenizer.tokenize(input_str)
                    subtoken_offsets = []
                    _o = 0
                    for each in input_tokens:
                        subtoken_offsets.append((_o, _o + len(each)))
                        _o += len(each)
                    if add_special_tokens:
                        input_tokens = [self.cls_token] + input_tokens + [self.sep_token]
                    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
                if self.check_space_before:
                    non_blank_offsets = [i for i in range(len(input_tokens)) if input_tokens[i] != '▁']
                    if add_special_tokens and not self.has_cls:
                        non_blank_offsets.insert(0, 0)
                    input_tokens = [input_tokens[i] for i in non_blank_offsets]
                    input_ids = [input_ids[i] for i in non_blank_offsets]
                    if add_special_tokens:
                        non_blank_offsets = non_blank_offsets[1:-1]
                        subtoken_offsets = [subtoken_offsets[i - 1] for i in non_blank_offsets]
                    else:
                        subtoken_offsets = [subtoken_offsets[i] for i in non_blank_offsets]
                    # MT5 generates tokens like ▁of, which is bad for the tokenizer. So we want to remove the prefix.
                    for i, token in enumerate(input_tokens[1:-1] if add_special_tokens else input_tokens):
                        if input_str[subtoken_offsets[i][0]] == ' ':
                            subtoken_offsets[i] = (subtoken_offsets[i][0] + 1, subtoken_offsets[i][1])
                # The following block will tokenize each empty string (space) into an unk token
                # if add_special_tokens:
                #     if len(input_tokens) == 2:  # bos and eos, meaning that the text contains only some spaces
                #         input_tokens.insert(1, input_str)
                #         input_ids.insert(1, tokenizer.unk_token_id)
                #         subtoken_offsets.append((0, len(input_str)))
                # else:
                #     if not input_ids:  # This chunk might be some control chars getting removed by tokenizer
                #         input_tokens = [input_str]
                #         input_ids = [tokenizer.unk_token_id]
                #         subtoken_offsets = [(0, len(input_str))]
                return input_tokens, input_ids, subtoken_offsets

            if self.dict:
                chunks = self.dict.split(sample.get(f'{self.input_key}_', input_tokens))  # Match original text directly
                _input_tokens, _input_ids, _subtoken_offsets = [self.cls_token], [self.cls_token_id], []
                _offset = 0
                custom_words = sample['custom_words'] = []
                char_offset = 0
                for chunk in chunks:
                    if isinstance(chunk, str):  # Use transformed text as it's what models are trained on
                        chunk = input_tokens[char_offset:char_offset + len(chunk)]
                        tokens, ids, offsets = tokenize_str(chunk, add_special_tokens=False)
                        char_offset += len(chunk)
                    else:
                        begin, end, label = chunk
                        _offset = begin
                        # chunk offset is on char level, at this moment, there is no concept of tokens, just subtokens
                        if isinstance(label, list):
                            tokens, ids, offsets, delta = [], [], [], 0
                            for token in label:
                                _tokens, _ids, _offsets = tokenize_str(token, add_special_tokens=False)
                                tokens.extend(_tokens)
                                # track the subword offset of this chunk, -1 for [CLS]
                                custom_words.append(
                                    (len(_input_ids) + len(ids) - 1, len(_input_ids) + len(ids) - 1 + len(_ids), token))
                                ids.extend(_ids)
                                offsets.extend((x[0] + delta, x[1] + delta) for x in _offsets)
                                delta = offsets[-1][-1]
                        else:
                            tokens, ids, offsets = tokenize_str(input_tokens[begin:end], add_special_tokens=False)
                            # offsets = [(offsets[0][0], offsets[-1][-1])]
                            custom_words.append((len(_input_ids) - 1, len(_input_ids) + len(ids) - 1, label))
                        char_offset = end
                    _input_tokens.extend(tokens)
                    _input_ids.extend(ids)
                    _subtoken_offsets.extend((x[0] + _offset, x[1] + _offset) for x in offsets)
                    _offset = _subtoken_offsets[-1][-1]
                subtoken_offsets = _subtoken_offsets
                input_tokens = _input_tokens + [self.sep_token]
                input_ids = _input_ids + [self.sep_token_id]
            else:
                input_tokens, input_ids, subtoken_offsets = tokenize_str(input_tokens, add_special_tokens=True)

            if self.ret_subtokens:
                sample[f'{self.input_key}_subtoken_offsets'] = subtoken_offsets

        cls_is_bos = self.cls_is_bos
        if cls_is_bos is None:
            cls_is_bos = input_tokens[0] == BOS
        sep_is_eos = self.sep_is_eos
        if sep_is_eos is None:
            sep_is_eos = input_tokens[-1] == EOS
        if self.strip_cls_sep:
            if cls_is_bos:
                input_tokens = input_tokens[1:]
            if sep_is_eos:
                input_tokens = input_tokens[:-1]
        if not self.ret_mask_and_type:  # only need input_ids and token_span, use a light version
            if input_is_str:
                prefix_mask = self._init_prefix_mask(input_ids)
            else:
                if input_tokens:
                    return_offsets_mapping = tokenizer.is_fast and self.ret_subtokens
                    encodings = tokenizer.batch_encode_plus(
                        input_tokens,
                        return_offsets_mapping=return_offsets_mapping,  # Many tokenizers do not offer fast version
                        add_special_tokens=False
                    )
                    subtoken_ids_per_token = encodings.data['input_ids']
                    if return_offsets_mapping:
                        offsets_mapping = [encoding.offsets for encoding in encodings.encodings]
                    else:
                        offsets_mapping = []
                        for token, subtoken_ids in zip(input_tokens, subtoken_ids_per_token):
                            if len(subtoken_ids) > len(token):  # … --> ...
                                del subtoken_ids[len(token):]
                            if not subtoken_ids:
                                subtoken_ids = [tokenizer.unk_token_id]
                            # Since non-fast tok generates no mapping, we have to guess
                            char_per_subtoken = max(len(token) // len(subtoken_ids), 1)
                            bes = [(b, b + char_per_subtoken) for b in range(0, len(token), char_per_subtoken)]
                            if not bes:  # the token is an empty string
                                bes = [(0, 0)]
                            if len(bes) != len(subtoken_ids):
                                bes[len(subtoken_ids) - 1] = (bes[len(subtoken_ids) - 1][0], len(token))
                                del bes[len(subtoken_ids):]
                            offsets_mapping.append(bes)
                else:
                    encodings = SerializableDict()
                    subtoken_ids_per_token = []
                    encodings.data = {'input_ids': subtoken_ids_per_token}
                if self.check_space_before:
                    # noinspection PyUnboundLocalVariable
                    for token, subtokens, mapping, encoding in zip(input_tokens, subtoken_ids_per_token,
                                                                   offsets_mapping, encodings.encodings):
                        # Remove ▁ generated by spm for 2 reasons:
                        # 1. During decoding, mostly no ▁ will be created unless blanks are placed between tokens (which
                        # is true for English but in English it will likely be concatenated to the token following it)
                        # 2. For T5, '▁' is used as CLS
                        if len(subtokens) > 1 and encoding.tokens[0] == '▁':
                            subtokens.pop(0)
                            if mapping:
                                mapping.pop(0)
                # Some tokens get stripped out
                subtoken_ids_per_token = [ids if ids else [tokenizer.unk_token_id] for ids in subtoken_ids_per_token]
                input_ids = sum(subtoken_ids_per_token, [self.cls_token_id])
                if self.sep_is_eos is None:
                    # None means to check whether sep is at the tail or between tokens
                    if sep_is_eos:
                        input_ids += [self.sep_token_id]
                    elif self.sep_token_id not in input_ids:
                        input_ids += [self.sep_token_id]
                else:
                    input_ids += [self.sep_token_id]
                # else self.sep_is_eos == False means sep is between tokens and don't bother to check

                if self.ret_subtokens:
                    prefix_mask = self._init_prefix_mask(input_ids)
                    # if self.check_space_before:
                    #     if offsets_mapping[0] and not input_tokens[0].startswith(' '):
                    #         prefix_mask[1] = False
                else:
                    prefix_mask = [False] * len(input_ids)
                    offset = 1
                    for _subtokens in subtoken_ids_per_token:
                        prefix_mask[offset] = True
                        offset += len(_subtokens)
                if self.ret_subtokens:
                    subtoken_offsets = []
                    for token, offsets in zip(input_tokens, offsets_mapping):
                        if offsets:
                            subtoken_offsets.append(offsets)
                        else:
                            subtoken_offsets.append([(0, len(token))])
                    if self.ret_subtokens_group:
                        sample[f'{self.input_key}_subtoken_offsets_group'] = subtoken_offsets
                    else:
                        sample[f'{self.input_key}_subtoken_offsets'] = sum(subtoken_offsets, [])
        else:
            input_ids, attention_mask, token_type_ids, prefix_mask = \
                convert_examples_to_features(input_tokens,
                                             None,
                                             tokenizer,
                                             cls_token_at_end=self.cls_token_at_end,
                                             # xlnet has a cls token at the end
                                             cls_token=tokenizer.cls_token,
                                             cls_token_segment_id=self.cls_token_segment_id,
                                             sep_token=self.sep_token,
                                             sep_token_extra=self.sep_token_extra,
                                             # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                                             pad_on_left=self.pad_on_left,
                                             # pad on the left for xlnet
                                             pad_token_id=self.pad_token_id,
                                             pad_token_segment_id=self.pad_token_segment_id,
                                             pad_token_label_id=0,
                                             do_padding=self.do_padding)
        if len(input_ids) > self.max_seq_length:
            if self.truncate_long_sequences:
                # raise SequenceTooLong(
                #     f'Input tokens {input_tokens} exceed the max sequence length of {self.max_seq_length - 2}. '
                #     f'For sequence tasks, truncate_long_sequences = True is not supported.'
                #     f'You are recommended to split your long text into several sentences within '
                #     f'{self.max_seq_length - 2} tokens beforehand. '
                #     f'Or simply set truncate_long_sequences = False to enable sliding window.')
                input_ids = input_ids[:self.max_seq_length]
                prefix_mask = prefix_mask[:self.max_seq_length]
                warnings.warn(
                    f'Input tokens {input_tokens} exceed the max sequence length of {self.max_seq_length - 2}. '
                    f'The exceeded part will be truncated and ignored. '
                    f'You are recommended to split your long text into several sentences within '
                    f'{self.max_seq_length - 2} tokens beforehand.'
                    f'Or simply set truncate_long_sequences = False to enable sliding window.'
                )
            else:
                input_ids = self.sliding_window(input_ids, input_ids[-1] == self.sep_token_id)
        if prefix_mask:
            if cls_is_bos:
                prefix_mask[0] = True
            if sep_is_eos:
                prefix_mask[-1] = True
        outputs = [input_ids]
        if self.ret_mask_and_type:
            # noinspection PyUnboundLocalVariable
            outputs += [attention_mask, token_type_ids]
        if self.ret_prefix_mask:
            outputs += [prefix_mask]
        if ret_token_span and prefix_mask:
            if cls_is_bos:
                token_span = [[0]]
            else:
                token_span = []
            offset = 1
            span = []
            for mask in prefix_mask[1:len(prefix_mask) if sep_is_eos is None else -1]:  # skip [CLS] and [SEP]
                if mask and span:
                    token_span.append(span)
                    span = []
                span.append(offset)
                offset += 1
            if span:
                token_span.append(span)
            if sep_is_eos:
                assert offset == len(prefix_mask) - 1
                token_span.append([offset])
            outputs.append(token_span)
        for k, v in zip(self.output_key, outputs):
            sample[k] = v
        return sample

    def _init_prefix_mask(self, input_ids):
        prefix_mask = [True] * len(input_ids)
        if not self.cls_is_bos:
            prefix_mask[0] = False
        if not self.sep_is_eos:
            prefix_mask[-1] = False
        return prefix_mask


def config_is(config, model='bert'):
    """Tell whether the lower-cased class name of ``config`` contains ``model``.

    Args:
        config: Any object; only the name of its type is inspected.
        model: Substring to look for (Default value = 'bert'). It is compared
            as-is against the lower-cased class name.

    Returns:
        ``True`` if ``model`` occurs in the lower-cased type name.
    """
    class_name = type(config).__name__
    return model in class_name.lower()


def convert_examples_to_features(
        words,
        max_seq_length: Optional[int],
        tokenizer,
        labels=None,
        label_map=None,
        cls_token_at_end=False,
        cls_token="[CLS]",
        cls_token_segment_id=1,
        sep_token="[SEP]",
        sep_token_extra=False,
        pad_on_left=False,
        pad_token_id=0,
        pad_token_segment_id=0,
        pad_token_label_id=0,
        sequence_a_segment_id=0,
        mask_padding_with_zero=True,
        unk_token='[UNK]',
        do_padding=True
):
    """Convert a sequence of words into subtoken-level features.

    Each word is split with ``tokenizer.tokenize``; the first subtoken of a word
    carries the word's label (or ``True`` when no ``label_map`` is given) and the
    remaining subtokens carry ``pad_token_label_id``. Special tokens are then added:

        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)

    Args:
      words: The words of one sentence.
      max_seq_length: Maximum number of subtokens including the special tokens. ``None`` disables truncation
        and is only valid together with ``do_padding=False``.
      tokenizer: Object providing ``tokenize(word)`` and ``convert_tokens_to_ids(tokens)``.
      labels: One label per word. When empty, ``words`` stands in for the labels and ``pad_token_label_id``
        is replaced by ``False`` (Default value = None)
      label_map: Mapping from label to id; without it the first subtoken of each word is labelled ``True``
        (Default value = None)
      cls_token_at_end: Append the CLS token instead of prepending it (Default value = False)
      cls_token: (Default value = "[CLS]")
      cls_token_segment_id: (Default value = 1)
      sep_token: (Default value = "[SEP]")
      sep_token_extra: Append a second separator (Default value = False)
      pad_on_left: Pad at the front instead of the back (Default value = False)
      pad_token_id: (Default value = 0)
      pad_token_segment_id: (Default value = 0)
      pad_token_label_id: (Default value = 0)
      sequence_a_segment_id: (Default value = 0)
      mask_padding_with_zero: Real tokens get mask 1 and padding 0 when true, the opposite otherwise
        (Default value = True)
      unk_token: Token substituted for words the tokenizer maps to nothing (Default value = '[UNK]')
      do_padding: Pad every output up to ``max_seq_length`` (Default value = True)

    Returns:
      A tuple ``(input_ids, input_mask, segment_ids, label_ids)`` of equally long lists.

    Raises:
      ValueError: If ``do_padding`` is true but ``max_seq_length`` is ``None``.
    """
    args = locals()
    if do_padding and max_seq_length is None:
        # Padding needs a target length; fail early with a clear message instead of `None - int`.
        raise ValueError('max_seq_length must be specified when do_padding is True')
    if not labels:
        labels = words
        pad_token_label_id = False

    tokens = []
    label_ids = []
    for word, label in zip(words, labels):
        word_tokens = tokenizer.tokenize(word)
        if not word_tokens:
            # Some weird chars cause the tokenizer to return an empty list. Always emit at least one unk
            # token, even for an empty word, so that tokens and label_ids stay aligned.
            word_tokens = [unk_token] * max(len(word), 1)
        tokens.extend(word_tokens)
        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
        label_ids.extend([label_map[label] if label_map else True] + [pad_token_label_id] * (len(word_tokens) - 1))

    # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
    special_tokens_count = 3 if sep_token_extra else 2
    if max_seq_length and len(tokens) > max_seq_length - special_tokens_count:
        warnings.warn(
            f'Input tokens {words} exceed the max sequence length of {max_seq_length - special_tokens_count}. '
            f'The exceeded part will be truncated and ignored. '
            f'You are recommended to split your long text into several sentences within '
            f'{max_seq_length - special_tokens_count} tokens beforehand.')
        tokens = tokens[: (max_seq_length - special_tokens_count)]
        label_ids = label_ids[: (max_seq_length - special_tokens_count)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  token_type_ids:   0   0   0   0  0     0   0
    #
    # Where "token_type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens += [sep_token]
    label_ids += [pad_token_label_id]
    if sep_token_extra:
        # roberta uses an extra separator b/w pairs of sentences
        tokens += [sep_token]
        label_ids += [pad_token_label_id]
    segment_ids = [sequence_a_segment_id] * len(tokens)

    if cls_token_at_end:
        tokens += [cls_token]
        label_ids += [pad_token_label_id]
        segment_ids += [cls_token_segment_id]
    else:
        tokens = [cls_token] + tokens
        label_ids = [pad_token_label_id] + label_ids
        segment_ids = [cls_token_segment_id] + segment_ids

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    if do_padding:
        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token_id] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
        else:
            input_ids += [pad_token_id] * padding_length
            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
            segment_ids += [pad_token_segment_id] * padding_length
            label_ids += [pad_token_label_id] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length, f'failed for:\n {args}'
    else:
        assert len(set(len(x) for x in [input_ids, input_mask, segment_ids, label_ids])) == 1
    return input_ids, input_mask, segment_ids, label_ids


def main():
    """Load the tokenizer named 'bert-base-uncased' and run the sequence transform demo on it."""
    tokenizer: PreTrainedTokenizer = AutoTokenizer_.from_pretrained('bert-base-uncased')
    # _test_text_transform(tokenizer)
    _test_sequence_transform(tokenizer)


def _test_text_transform(tokenizer):
    """Print the result of applying ``TransformerTextTokenizer`` (keyed on 'text') to a small sample."""
    text_transform = TransformerTextTokenizer(tokenizer, 'text')
    print(text_transform({'text': 'HanLP good'}))


def _test_sequence_transform(tokenizer):
    """Print the result of applying ``TransformerSequenceTokenizer`` (keyed on 'token') to a two-word sample."""
    sequence_transform = TransformerSequenceTokenizer(tokenizer, 'token')
    print(sequence_transform({'token': 'HanLP good'.split()}))


# Run the demo only when this module is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()


================================================
FILE: hanlp/transform/tsv_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-06-13 21:15
import functools
from abc import ABC
from typing import Tuple, Union, Optional, Iterable, List

import tensorflow as tf

from hanlp_common.structure import SerializableDict

from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp.utils.io_util import generate_words_tags_from_tsv
from hanlp.utils.tf_util import str_tensor_to_str
from hanlp_common.util import merge_locals_kwargs


def dataset_from_tsv(tsv_file_path, word_vocab: VocabTF, char_vocab: VocabTF, tag_vocab: VocabTF, batch_size=32,
                     shuffle=None, repeat=None, prefetch=1, lower=False, **kwargs):
    """Build a batched dataset from a tsv file by delegating to ``dataset_from_generator``.

    Args:
      tsv_file_path: Path handed to ``generate_words_tags_from_tsv``.
      word_vocab: Word vocabulary.
      char_vocab: Character vocabulary; only forwarded to the sample generator.
      tag_vocab: Tag vocabulary.
      batch_size: (Default value = 32)
      shuffle: (Default value = None)
      repeat: (Default value = None)
      prefetch: (Default value = 1)
      lower: (Default value = False)
      **kwargs: Passed through to ``dataset_from_generator``.

    Returns:
      Whatever ``dataset_from_generator`` returns.
    """
    # NOTE(review): the vocabularies and `lower` are bound positionally here, while TsvTaggingFormat calls
    # generate_words_tags_from_tsv with (filepath, gold=, lower=, max_seq_length=) - confirm the positional
    # signature of that helper still matches this call.
    read_samples = functools.partial(generate_words_tags_from_tsv, tsv_file_path, word_vocab, char_vocab, tag_vocab,
                                     lower)
    return dataset_from_generator(read_samples, word_vocab, tag_vocab, batch_size=batch_size, shuffle=shuffle,
                                  repeat=repeat, prefetch=prefetch, **kwargs)


def dataset_from_generator(generator, word_vocab, tag_vocab, batch_size=32, shuffle=None, repeat=None, prefetch=1,
                           **kwargs):
    """Wrap a (words, tags) generator into a padded, batched and prefetched ``tf.data.Dataset``.

    Args:
      generator: Callable given to ``tf.data.Dataset.from_generator``; both outputs are declared as
        variable-length string vectors.
      word_vocab: Supplies the padding value for the words.
      tag_vocab: Supplies the padding value for the tags: its ``pad_token`` when truthy, otherwise its
        ``first_token``.
      batch_size: (Default value = 32)
      shuffle: Falsy disables shuffling, ``True`` uses a buffer of 1024, any other truthy value is used as
        the buffer size (Default value = None)
      repeat: When truthy, passed to ``Dataset.repeat`` (Default value = None)
      prefetch: (Default value = 1)
      **kwargs: Accepted and ignored.

    Returns:
      The resulting dataset.
    """
    shapes = [None], [None]
    types = tf.string, tf.string
    word_pad = word_vocab.pad_token
    tag_pad = tag_vocab.pad_token or tag_vocab.first_token
    dataset = tf.data.Dataset.from_generator(generator, output_shapes=shapes, output_types=types)
    if shuffle:
        buffer_size = 1024 if isinstance(shuffle, bool) else shuffle
        dataset = dataset.shuffle(buffer_size)
    if repeat:
        dataset = dataset.repeat(repeat)
    return dataset.padded_batch(batch_size, shapes, (word_pad, tag_pad)).prefetch(prefetch)


def vocab_from_tsv(tsv_file_path, lower=False, lock_word_vocab=False, lock_char_vocab=True, lock_tag_vocab=True) \
        -> Tuple[VocabTF, VocabTF, VocabTF]:
    """Collect word, character and tag vocabularies from a two-column tsv file.

    Blank lines are skipped; every other line must split on whitespace into exactly a word and a tag.

    Args:
      tsv_file_path: Path of the UTF-8 encoded file.
      lower: Lower-case words before adding them to the word vocabulary; characters are always taken
        from the original word (Default value = False)
      lock_word_vocab: Lock the word vocabulary before returning (Default value = False)
      lock_char_vocab: Lock the character vocabulary before returning (Default value = True)
      lock_tag_vocab: Lock the tag vocabulary before returning (Default value = True)

    Returns:
      The ``(word_vocab, char_vocab, tag_vocab)`` triple.
    """
    word_vocab, char_vocab, tag_vocab = VocabTF(), VocabTF(), VocabTF(unk_token=None)
    with open(tsv_file_path, encoding='utf-8') as tsv_file:
        for line in tsv_file:
            cells = line.strip().split()
            if not cells:
                continue
            word, tag = cells
            word_vocab.add(word.lower() if lower else word)
            char_vocab.update(list(word))
            tag_vocab.add(tag)
    lock_plan = ((word_vocab, lock_word_vocab), (char_vocab, lock_char_vocab), (tag_vocab, lock_tag_vocab))
    for vocab, should_lock in lock_plan:
        if should_lock:
            vocab.lock()
    return word_vocab, char_vocab, tag_vocab


class TsvTaggingFormat(Transform, ABC):
    """Transform mixin that reads gold tagging samples from a tsv file."""

    @property
    def max_seq_length(self):
        """The ``max_seq_length`` entry of the config, or ``None`` when it is absent."""
        return self.config.get('max_seq_length', None)

    def file_to_inputs(self, filepath: str, gold=True):
        """Yield the samples produced by ``generate_words_tags_from_tsv`` for ``filepath``.

        Args:
          filepath: The tsv file to read.
          gold: Must be truthy; the check runs when iteration starts because this is a generator
            (Default value = True)
        """
        assert gold, 'TsvTaggingFormat does not support reading non-gold files'
        lower = self.config.get('lower', False)
        yield from generate_words_tags_from_tsv(filepath, gold=gold, lower=lower,
                                                max_seq_length=self.max_seq_length)


class TSVTaggingTransform(TsvTaggingFormat, Transform):
    """Tagging transform that maps words and tags through ``VocabTF`` vocabularies."""

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, use_char=False, **kwargs) -> None:
        # locals() must be captured before any other local is created so that only the arguments are forwarded.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.word_vocab: Optional[VocabTF] = None
        self.tag_vocab: Optional[VocabTF] = None
        self.char_vocab: Optional[VocabTF] = None

    def fit(self, trn_path: str, **kwargs) -> int:
        """Build the word and tag vocabularies from the training file.

        Args:
          trn_path: Training file read through ``file_to_inputs``.
          **kwargs: Unused.

        Returns:
          The number of samples read.
        """
        self.word_vocab = VocabTF()
        self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
        num_samples = 0
        for words, tags in self.file_to_inputs(trn_path, True):
            self.word_vocab.update(words)
            self.tag_vocab.update(tags)
            num_samples += 1
        # NOTE(review): char_vocab is initialised to None in __init__, so this branch only runs when a truthy
        # char_vocab was assigned beforehand. Possibly the `use_char` option was meant here - confirm.
        if self.char_vocab:
            self.char_vocab = VocabTF()
            for word in self.word_vocab.token_to_idx.keys():
                if word in (self.word_vocab.pad_token, self.word_vocab.unk_token):
                    continue
                self.char_vocab.update(list(word))
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Return the tf types, shapes and padding values of the (words, tags) pair."""
        types = tf.string, tf.string
        shapes = [None], [None]
        values = self.word_vocab.pad_token, self.tag_vocab.first_token
        return types, shapes, values

    @staticmethod
    def _lowercase(x):
        """Lower-case a single string, or every element of a sequence of strings."""
        if isinstance(x, (str, bytes)):
            return x.lower()
        return [word.lower() for word in x]

    def inputs_to_samples(self, inputs, gold=False):
        """Turn inputs into (x, y) samples, lower-casing x when the ``lower`` config entry is set.

        Args:
          inputs: Iterable of (x, y) pairs when ``gold`` is true, otherwise an iterable of x.
          gold: Whether ``inputs`` already carry tags (Default value = False)

        Yields:
          ``(x, y)`` pairs. For non-gold inputs y is the last padding value repeated ``len(x)`` times.
        """
        lower = self.config.get('lower', False)
        if gold:
            if lower:
                for x, y in inputs:
                    # x is normally a list of words, which has no .lower() of its own
                    yield self._lowercase(x), y
            else:
                yield from inputs
        else:
            for x in inputs:
                yield self._lowercase(x) if lower else x, [self.padding_values[-1]] * len(x)

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        """Look ``x`` up in the word vocabulary."""
        return self.word_vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        """Look ``y`` up in the tag vocabulary."""
        return self.tag_vocab.lookup(y)

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        """Yield one list of words per row of ``X``.

        Each element is decoded with ``str_tensor_to_str`` when ``char_vocab`` is truthy, otherwise it is
        treated as an index into ``word_vocab.idx_to_token``.
        """
        for xs in X:
            yield [str_tensor_to_str(x) if self.char_vocab else self.word_vocab.idx_to_token[int(x)] for x in xs]

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False,
                     inputs=None, X=None, **kwargs) -> Iterable:
        """Yield one list of tag strings per sample.

        Args:
          Y: Tag indices when ``gold`` is true, otherwise scores reduced with ``tf.argmax`` over axis 2.
          gold: (Default value = False)
          inputs: The inputs the rows of ``Y`` belong to; each tag list is cut to the length of its input.
          X: Unused.
          **kwargs: Unused.
        """
        if not gold:
            Y = tf.argmax(Y, axis=2)
        for ys, xs in zip(Y, inputs):
            # zipping with the input drops the positions beyond the input's length
            yield [self.tag_vocab.idx_to_token[int(y)] for y, _ in zip(ys, xs)]

    def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
        """A single sample is a list whose first element is a string."""
        return isinstance(input[0], str)

    def input_truth_output_to_str(self, input: List[str], truth: List[str], output: List[str]):
        """Format one sample as ``word gold_tag pred_tag`` lines followed by a blank line."""
        lines = [' '.join([word, gold_tag, pred_tag]) + '\n' for word, gold_tag, pred_tag in zip(input, truth, output)]
        return ''.join(lines) + '\n'


================================================
FILE: hanlp/transform/txt_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-24 15:07
import functools
from abc import ABC
from typing import Tuple, Union, List, Iterable

import tensorflow as tf

from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp.utils.io_util import get_resource
from hanlp.utils.lang.zh.char_table import CharTable
from hanlp.utils.span_util import bmes_of, bmes_to_words
from hanlp.utils.string_util import split_long_sent


def generate_words_per_line(file_path):
    """Yield the whitespace-separated words of every non-blank line in a UTF-8 text file.

    Args:
      file_path: Path of the file to read.

    Yields:
      A non-empty list of words per line; lines containing only whitespace are skipped.
    """
    with open(file_path, encoding='utf-8') as src:
        # str.split() without a separator already ignores leading and trailing whitespace
        for words in map(str.split, src):
            if words:
                yield words


def words_to_bmes(words):
    """Produce one B/M/E/S tag per character of the given words.

    A one-character word is tagged ``S``; a longer word is tagged ``B``, then ``M`` for each inner
    character, then ``E``.

    Args:
      words: Iterable of non-empty words.

    Returns:
      The flat list of tags.

    Raises:
      ValueError: If a word is ``None`` or empty.
    """
    tags = []
    for word in words:
        if not word:
            raise ValueError('{} contains None or zero-length word {}'.format(str(words), word))
        length = len(word)
        # extending with a string appends its characters one by one
        tags.extend('S' if length == 1 else 'B' + 'M' * (length - 2) + 'E')
    return tags


def extract_ngram_features_and_tags(sentence, bigram_only=False, window_size=4, segmented=True):
    """
    Extract the character and n-gram features of a sentence together with its tags,
    for windowed approaches.
    See Also https://github.com/chqiwang/convseg/

    Parameters
    ----------
    sentence
        Passed to ``bmes_of`` to obtain characters and tags.
    bigram_only
        Forwarded to ``extract_ngram_features``.
    window_size
        Forwarded to ``extract_ngram_features``.
    segmented
        Forwarded to ``bmes_of``.

    Returns
    -------
    A pair ``(x, y)``: ``x`` is a tuple holding the normalized characters followed by
    every n-gram feature column, ``y`` is the tags returned by ``bmes_of``.
    """
    chars, tags = bmes_of(sentence, segmented)
    chars = CharTable.normalize_chars(chars)
    # TODO: optimize ngram generation using https://www.tensorflow.org/api_docs/python/tf/strings/ngrams
    features = (chars, *extract_ngram_features(chars, bigram_only, window_size))
    return features, tags  # x, y


def extract_ngram_features(chars, bigram_only, window_size):
    """Build the n-gram feature columns around every character of ``chars``.

    Every returned column has one entry per character. An n-gram that would reach beyond either
    end of the sequence, or that contains an empty element, is represented by ``''``.

    Args:
      chars: List of characters.
      bigram_only: When truthy, return the four bigrams that start two positions before, one position
        before, at, and one position after each character.
      window_size: Used only when ``bigram_only`` is falsy. For ``window_size >= 1`` the characters
        themselves are the first column; for each order ``k`` from 2 to 4 with ``window_size >= k``, the
        ``k`` k-grams containing the character follow, ordered from the one ending at it to the one
        starting at it. A non-positive value yields no columns.

    Returns:
      The list of feature columns.
    """

    def column(padded, size, start):
        # the `size` consecutive entries beginning at index `start` of the padded sequence
        return padded[start:start + size]

    def fuse(padded, size, start, order):
        # join `order` neighbouring columns element-wise, blanking any gram with an empty part
        shifted = [column(padded, size, start + k) for k in range(order)]
        return [''.join(gram) if all(gram) else '' for gram in zip(*shifted)]

    features = []
    if bigram_only:
        padded = ['', ''] + chars + ['', '']
        size = len(padded) - 4
        features.extend(fuse(padded, size, start, 2) for start in range(4))
    elif window_size > 0:
        padded = ['', '', ''] + chars + ['', '', '']
        size = len(padded) - 6
        if window_size >= 1:
            # single char
            features.append(column(padded, size, 3))
        for order in (2, 3, 4):
            if window_size >= order:
                # grams of this order slide from "ends at the char" to "starts at the char"
                features.extend(fuse(padded, size, start, order) for start in range(4 - order, 4))
    return features


def generate_ngram_bmes(file_path, bigram_only=False, window_size=4, gold=True):
    """Yield ``extract_ngram_features_and_tags`` results for every non-blank line of a UTF-8 file.

    Args:
      file_path: Path of the file to read.
      bigram_only: (Default value = False)
      window_size: (Default value = 4)
      gold: Forwarded as the ``segmented`` argument (Default value = True)
    """
    with open(file_path, encoding='utf-8') as src:
        for sentence in map(str.strip, src):
            if sentence:
                yield extract_ngram_features_and_tags(sentence, bigram_only, window_size, gold)


def vocab_from_txt(txt_file_path, bigram_only=False, window_size=4, **kwargs) -> Tuple[VocabTF, VocabTF, VocabTF]:
    """Collect character, n-gram and tag vocabularies from a text file.

    Args:
      txt_file_path: File read through ``generate_ngram_bmes`` in gold mode.
      bigram_only: (Default value = False)
      window_size: (Default value = 4)
      **kwargs: Accepted and ignored.

    Returns:
      The ``(char_vocab, ngram_vocab, tag_vocab)`` triple. Empty n-grams are not added.
    """
    char_vocab = VocabTF()
    ngram_vocab = VocabTF()
    tag_vocab = VocabTF(pad_token=None, unk_token=None)
    for features, tags in generate_ngram_bmes(txt_file_path, bigram_only, window_size, gold=True):
        chars, *ngram_columns = features
        char_vocab.update(chars)
        for column in ngram_columns:
            ngram_vocab.update(gram for gram in column if gram)
        tag_vocab.update(tags)
    return char_vocab, ngram_vocab, tag_vocab


def dataset_from_txt(txt_file_path: str, char_vocab: VocabTF, ngram_vocab: VocabTF, tag_vocab: VocabTF,
                     bigram_only=False,
                     window_size=4, segmented=True, batch_size=32, shuffle=None, repeat=None, prefetch=1):
    """Build a batched dataset from a text file by delegating to ``dataset_from_generator``.

    Args:
      txt_file_path: File read lazily through ``generate_ngram_bmes``.
      char_vocab: Character vocabulary.
      ngram_vocab: N-gram vocabulary.
      tag_vocab: Tag vocabulary.
      bigram_only: (Default value = False)
      window_size: (Default value = 4)
      segmented: Forwarded as the ``gold`` argument of ``generate_ngram_bmes`` (Default value = True)
      batch_size: (Default value = 32)
      shuffle: (Default value = None)
      repeat: (Default value = None)
      prefetch: (Default value = 1)

    Returns:
      Whatever ``dataset_from_generator`` returns.
    """
    read_samples = functools.partial(generate_ngram_bmes, txt_file_path, bigram_only, window_size, segmented)
    return dataset_from_generator(read_samples, char_vocab, ngram_vocab, tag_vocab,
                                  bigram_only=bigram_only, window_size=window_size, batch_size=batch_size,
                                  shuffle=shuffle, repeat=repeat, prefetch=prefetch)


def dataset_from_generator(generator, char_vocab, ngram_vocab, tag_vocab, bigram_only=False, window_size=4,
                           batch_size=32, shuffle=None, repeat=None, prefetch=1):
    """Wrap an n-gram feature generator into a padded, batched and prefetched ``tf.data.Dataset``.

    Args:
      generator: Callable given to ``tf.data.Dataset.from_generator``.
      char_vocab: Supplies the padding value of the first input column, and of the n-gram columns when
        ``ngram_vocab`` is falsy.
      ngram_vocab: Supplies the padding value of the n-gram columns when truthy.
      tag_vocab: Supplies the tag padding value: its ``pad_token`` when truthy, otherwise its
        ``first_token``.
      bigram_only: When truthy, four n-gram columns are declared (Default value = False)
      window_size: Otherwise ``window_size * (window_size + 1) // 2`` n-gram columns are declared
        (Default value = 4)
      batch_size: (Default value = 32)
      shuffle: Falsy disables shuffling, ``True`` uses a buffer of 1024, any other truthy value is used as
        the buffer size (Default value = None)
      repeat: When truthy, passed to ``Dataset.repeat`` (Default value = None)
      prefetch: (Default value = 1)

    Returns:
      The resulting dataset.
    """
    ngram_size = 4 if bigram_only else window_size * (window_size + 1) // 2
    # one extra input column in front of the n-gram columns
    num_inputs = ngram_size + 1
    shapes = tuple([[None]] * num_inputs), [None]
    types = tuple([tf.string] * num_inputs), tf.string
    char_pad = char_vocab.pad_token
    ngram_pad = ngram_vocab.pad_token if ngram_vocab else char_vocab.pad_token
    tag_pad = tag_vocab.pad_token or tag_vocab.first_token
    defaults = tuple([char_pad] + [ngram_pad] * ngram_size), tag_pad
    dataset = tf.data.Dataset.from_generator(generator, output_shapes=shapes, output_types=types)
    if shuffle:
        buffer_size = 1024 if isinstance(shuffle, bool) else shuffle
        dataset = dataset.shuffle(buffer_size)
    if repeat:
        dataset = dataset.repeat(repeat)
    return dataset.padded_batch(batch_size, shapes, defaults).prefetch(prefetch)


class TxtFormat(Transform, ABC):
    """Transform mixin that reads one sentence per line from a text resource."""

    def file_to_inputs(self, filepath: str, gold=True):
        """Yield every non-blank line of the resource with surrounding whitespace stripped.

        Args:
          filepath: Resolved with ``get_resource`` before being opened as UTF-8 text.
          gold: Unused here (Default value = True)
        """
        with open(get_resource(filepath), encoding='utf-8') as src:
            for sentence in map(str.strip, src):
                if sentence:
                    yield sentence


class TxtBMESFormat(TxtFormat, ABC):
    """Text format whose samples are character sequences paired with per-character tags."""

    def file_to_inputs(self, filepath: str, gold=True):
        """Yield ``(chars, tags)`` pairs for every line of ``filepath``.

        When ``max_seq_length`` is configured, each line is cut by ``split_long_sent`` at the
        punctuation delimiters and the tags are sliced to match every piece. If the config
        contains a ``transformer`` entry, two positions are reserved for ``[CLS]`` and ``[SEP]``.

        Args:
            filepath: File to read, resolved by the parent class.
            gold: Forwarded to ``bmes_of`` and to the parent reader.
        """
        max_seq_length = self.config.get('max_seq_length', False)
        delimiter = None
        if max_seq_length:
            if 'transformer' in self.config:
                max_seq_length -= 2  # allow for [CLS] and [SEP]
            delimiter = set('。！？：；、，,;!?、,')
        for text in super().file_to_inputs(filepath, gold):
            chars, tags = bmes_of(text, gold)
            if not max_seq_length:
                yield chars, tags
                continue
            # Walk through the pieces and keep the tag slice aligned with each one.
            offset = 0
            for piece in split_long_sent(chars, delimiter, max_seq_length):
                stop = offset + len(piece)
                yield piece, tags[offset:stop]
                offset = stop

    def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
        """Return ``True`` when ``input`` is a single string rather than a collection of them."""
        return isinstance(input, str)

    def inputs_to_samples(self, inputs, gold=False):
        """Yield ``(normalized_chars, tags)`` samples.

        Args:
            inputs: ``(chars, tags)`` pairs when ``gold`` is truthy, otherwise a sized
                collection of character sequences.
            gold: Whether ``inputs`` already carry tags. Without gold tags every position
                receives ``self.tag_vocab.safe_pad_token``.
        """
        if gold:
            pairs = inputs
        else:
            pairs = zip(inputs, [None] * len(inputs))
        for chars, tags in pairs:
            if not gold:
                tags = [self.tag_vocab.safe_pad_token] * len(chars)
            yield CharTable.normalize_chars(chars), tags

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
                     batch=None) -> Iterable:
        """Convert model output ``Y`` into token lists using this transform's tag vocabulary."""
        for tokens in self.Y_to_tokens(self.tag_vocab, Y, gold, inputs):
            yield tokens

    def Y_to_tokens(self, tag_vocab, Y, gold, inputs):
        """Yield the words recovered from each text in ``inputs`` and its tag ids in ``Y``.

        Args:
            tag_vocab: Vocabulary whose ``idx_to_token`` maps tag ids to tag strings.
            Y: Tag ids when ``gold`` is truthy; otherwise scores reduced with ``argmax`` over axis 2.
            gold: Whether ``Y`` already holds tag ids.
            inputs: The texts aligned with the rows of ``Y``.
        """
        tag_ids = Y if gold else tf.argmax(Y, axis=2)
        for text, row in zip(inputs, tag_ids):
            # Rows may be padded, so only the first len(text) positions are decoded.
            tags = [tag_vocab.idx_to_token[int(tag_id)] for tag_id in row[:len(text)]]
            yield bmes_to_words(list(text), tags)


================================================
FILE: hanlp/utils/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-24 22:12
from . import rules


def ls_resource_in_module(root) -> dict:
    """Recursively collect resource URLs declared as public string attributes of a module tree.

    A string attribute counts as a resource when its name does not start with ``_`` and its
    value starts with ``http`` and does not end with ``/`` or ``#``. Attributes that are
    modules are scanned recursively. Every scanned module that defines a dict named ``ALL``
    gets that dict updated in place with the resources found in it and its sub-modules.

    Modules that are currently being scanned are not re-entered, so modules that reference
    each other no longer cause unbounded recursion.

    Args:
        root: The module to scan.

    Returns:
        A dict mapping attribute names to resource URLs found in ``root`` and its sub-modules.
    """
    in_progress = set()  # ids of the modules on the current recursion path

    def collect(module) -> dict:
        in_progress.add(id(module))
        res = dict()
        for k, v in module.__dict__.items():
            # Identity check instead of ``==`` avoids invoking arbitrary ``__eq__`` implementations.
            if k.startswith('_') or v is module:
                continue
            if isinstance(v, str):
                if v.startswith('http') and not v.endswith(('/', '#')):
                    res[k] = v
            elif type(v).__name__ == 'module':
                if id(v) in in_progress:
                    continue  # cyclic reference back to a module we are already scanning
                res.update(collect(v))
        registry = module.__dict__.get('ALL')
        if isinstance(registry, dict):
            registry.update(res)
        in_progress.discard(id(module))
        return res

    return collect(root)


================================================
FILE: hanlp/utils/component_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 19:24
import os
from hanlp_common.constant import HANLP_VERBOSE
from hanlp_common.io import load_json, eprint, save_json
from hanlp_common.reflection import object_from_classpath, str_to_type
from hanlp import pretrained
from hanlp import version
from hanlp.common.component import Component
from hanlp.utils.io_util import get_resource, get_latest_info_from_pypi, check_version_conflicts
from hanlp_common.util import isdebugging


def load_from_meta_file(save_dir: str, meta_filename='meta.json', transform_only=False, verbose=HANLP_VERBOSE,
                        **kwargs) -> Component:
    """
    Load a component from a ``meta.json`` (legacy TensorFlow component) or a ``config.json`` file.

    Args:
        save_dir: The identifier. It is resolved with ``get_resource``; if the resolved path ends with
            ``.json``, its basename is used as the meta filename and its parent as the directory.
        meta_filename (str): The meta file of that saved component, which stores the classpath and version.
        transform_only: Load and return only the transform.
        verbose: Forwarded to ``component.load()`` when a ``config.json`` exists in the resolved directory.
        **kwargs: Extra parameters passed to ``component.load()``.

    Returns:

        A component.

    Raises:
        FileNotFoundError: If no meta file can be found and the identifier is not one of the specially
            handled embeddings or classifiers.
        ModuleNotFoundError: If a module required by the component is missing.
        ConnectionError: If loading failed with a ``ValueError`` mentioning the Internet connection.
        version.NotCompatible: If loading failed and conflicting package versions were detected.
    """
    identifier = save_dir
    load_path = save_dir
    save_dir = get_resource(save_dir)
    if save_dir.endswith('.json'):
        meta_filename = os.path.basename(save_dir)
        save_dir = os.path.dirname(save_dir)
    metapath = os.path.join(save_dir, meta_filename)
    if not os.path.isfile(metapath):
        tf_model = False
        metapath = os.path.join(save_dir, 'config.json')
    else:
        tf_model = True
    cls = None
    if not os.path.isfile(metapath):
        tips = ''
        if save_dir.isupper():
            from difflib import SequenceMatcher
            similar_keys = sorted(pretrained.ALL.keys(),
                                  key=lambda k: SequenceMatcher(None, k, identifier).ratio(),
                                  reverse=True)[:5]
            tips = f'Check its spelling based on the available keys:\n' + \
                   f'{sorted(pretrained.ALL.keys())}\n' + \
                   f'Tips: it might be one of {similar_keys}'
        # These components are not intended to be loaded in this way, but I'm tired of explaining it again and again
        # so a config.json describing them is generated next to the downloaded resource.
        if identifier in pretrained.word2vec.ALL.values():
            save_dir = os.path.dirname(save_dir)
            metapath = os.path.join(save_dir, 'config.json')
            save_json({'classpath': 'hanlp.layers.embeddings.word2vec.Word2VecEmbeddingComponent',
                       'embed': {'classpath': 'hanlp.layers.embeddings.word2vec.Word2VecEmbedding',
                                 'embed': identifier, 'field': 'token', 'normalize': 'l2'},
                       'hanlp_version': version.__version__}, metapath)
        elif identifier in pretrained.fasttext.ALL.values():
            save_dir = os.path.dirname(save_dir)
            metapath = os.path.join(save_dir, 'config.json')
            save_json({'classpath': 'hanlp.layers.embeddings.fast_text.FastTextEmbeddingComponent',
                       'embed': {'classpath': 'hanlp.layers.embeddings.fast_text.FastTextEmbedding',
                                 'filepath': identifier, 'src': 'token'},
                       'hanlp_version': version.__version__}, metapath)
        elif identifier in {pretrained.classifiers.LID_176_FASTTEXT_SMALL,
                            pretrained.classifiers.LID_176_FASTTEXT_BASE}:
            save_dir = os.path.dirname(save_dir)
            metapath = os.path.join(save_dir, 'config.json')
            save_json({'classpath': 'hanlp.components.classifiers.fasttext_classifier.FastTextClassifier',
                       'model_path': identifier,
                       'hanlp_version': version.__version__}, metapath)
        else:
            raise FileNotFoundError(f'The identifier {save_dir} resolves to a nonexistent meta file {metapath}. {tips}')
    meta: dict = load_json(metapath)
    cls = meta.get('classpath', cls)
    if not cls:
        cls = meta.get('class_path', None)  # For older version
    if tf_model:
        # tf models are trained with version < 2.1. To migrate them to 2.1, map their classpath to new locations
        upgrade = {
            'hanlp.components.tok_tf.TransformerTokenizerTF': 'hanlp.components.tokenizers.tok_tf.TransformerTokenizerTF',
            'hanlp.components.pos.RNNPartOfSpeechTagger': 'hanlp.components.taggers.pos_tf.RNNPartOfSpeechTaggerTF',
            'hanlp.components.pos_tf.RNNPartOfSpeechTaggerTF': 'hanlp.components.taggers.pos_tf.RNNPartOfSpeechTaggerTF',
            'hanlp.components.pos_tf.CNNPartOfSpeechTaggerTF': 'hanlp.components.taggers.pos_tf.CNNPartOfSpeechTaggerTF',
            'hanlp.components.ner_tf.TransformerNamedEntityRecognizerTF': 'hanlp.components.ner.ner_tf.TransformerNamedEntityRecognizerTF',
            'hanlp.components.parsers.biaffine_parser.BiaffineDependencyParser': 'hanlp.components.parsers.biaffine_parser_tf.BiaffineDependencyParserTF',
            'hanlp.components.parsers.biaffine_parser.BiaffineSemanticDependencyParser': 'hanlp.components.parsers.biaffine_parser_tf.BiaffineSemanticDependencyParserTF',
            'hanlp.components.tok_tf.NgramConvTokenizerTF': 'hanlp.components.tokenizers.tok_tf.NgramConvTokenizerTF',
            'hanlp.components.classifiers.transformer_classifier.TransformerClassifier': 'hanlp.components.classifiers.transformer_classifier_tf.TransformerClassifierTF',
            'hanlp.components.taggers.transformers.transformer_tagger.TransformerTagger': 'hanlp.components.taggers.transformers.transformer_tagger_tf.TransformerTaggerTF',
            'hanlp.components.tok.NgramConvTokenizer': 'hanlp.components.tokenizers.tok_tf.NgramConvTokenizerTF',
        }
        cls = upgrade.get(cls, cls)
    assert cls, f'{meta_filename} doesn\'t contain classpath field'
    try:
        obj: Component = object_from_classpath(cls)
        if hasattr(obj, 'load'):
            if transform_only:
                # noinspection PyUnresolvedReferences
                obj.load_transform(save_dir)
            else:
                if os.path.isfile(os.path.join(save_dir, 'config.json')):
                    obj.load(save_dir, verbose=verbose, **kwargs)
                else:
                    obj.load(metapath, **kwargs)
            obj.config['load_path'] = load_path
        return obj
    except ModuleNotFoundError as e:
        if isdebugging():
            raise e from None
        else:
            raise ModuleNotFoundError(
                f'Some modules ({e.name} etc.) required by this model are missing. Please install the full version:'
                '\n\n\tpip install hanlp[full] -U') from None
    except ValueError as e:
        if e.args and isinstance(e.args[0], str) and 'Internet connection' in e.args[0]:
            raise ConnectionError(
                'Hugging Face 🤗 Transformers failed to download because your Internet connection is either off or bad.\n'
                'See https://hanlp.hankcs.com/docs/install.html#server-without-internet for solutions.') \
                from None
        raise e from None
    except Exception as e:
        # Some users often install an incompatible tf and put the blame on HanLP. Teach them the basics.
        try:
            you_installed_wrong_versions, extras = check_version_conflicts(extras=('full',) if tf_model else None)
        except Exception:
            you_installed_wrong_versions, extras = None, None
        if you_installed_wrong_versions:
            raise version.NotCompatible(you_installed_wrong_versions + '\nPlease reinstall HanLP in the proper way:' +
                                        '\n\n\tpip install --upgrade hanlp' + (
                                            f'[{",".join(extras)}]' if extras else '')) from None
        eprint(f'Failed to load {identifier}')
        from pkg_resources import parse_version
        model_version = meta.get("hanlp_version", '2.0.0-alpha.0')
        if model_version == '2.0.0':  # Quick fix: the first version used a wrong string
            model_version = '2.0.0-alpha.0'
        model_version = parse_version(model_version)
        installed_version = parse_version(version.__version__)
        try:
            latest_version = get_latest_info_from_pypi()
        except Exception:
            # Best effort only: the lookup must never mask the original loading error.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
            latest_version = None
        if model_version > installed_version:
            eprint(f'{identifier} was created with hanlp-{model_version}, '
                   f'while you are running a lower version: {installed_version}. ')
        if installed_version != latest_version:
            eprint(
                f'Please upgrade HanLP with:\n'
                f'\n\tpip install --upgrade hanlp\n')
        eprint(
            'If the problem still persists, please submit an issue to https://github.com/hankcs/HanLP/issues\n'
            'When reporting an issue, make sure to paste the FULL ERROR LOG below.')

        eprint(f'{"ERROR LOG BEGINS":=^80}')
        import platform
        eprint(f'OS: {platform.platform()}')
        eprint(f'Python: {platform.python_version()}')
        import torch
        eprint(f'PyTorch: {torch.__version__}')
        if tf_model:
            try:
                import tensorflow
                tf_version = tensorflow.__version__
                eprint(f'TensorFlow: {tf_version}')
            except ModuleNotFoundError:
                tf_version = 'not installed'
                eprint(f'TensorFlow: {tf_version}')
            except Exception as tf_e:
                # Report the TensorFlow import failure itself, not the outer loading error.
                eprint(f'TensorFlow cannot be imported due to {tf_e.__class__.__name__}: {tf_e}. '
                       f'Note this is not a bug of HanLP, but rather a compatability issue caused by TensorFlow.')
        eprint(f'HanLP: {version.__version__}')
        import sys
        sys.stderr.flush()
        # Append an end marker to the first string argument of the exception so that the
        # traceback printed afterwards is visually closed.
        try:
            if e.args and isinstance(e.args, tuple):
                for i, arg in enumerate(e.args):
                    if isinstance(arg, str):
                        from hanlp_common.util import set_tuple_with
                        e.args = set_tuple_with(e.args, arg + f'\n{"ERROR LOG ENDS":=^80}', i)
                        break
        except Exception:
            pass  # decorating the message is optional; never hide the original error
        raise e from None


def load_from_meta(meta: dict) -> Component:
    """Build a component from an in-memory meta dict.

    Args:
        meta: If it contains ``load_path``, the component is loaded from that path with
            ``load_from_meta_file``. Otherwise its ``class_path`` (or ``classpath``) entry names
            the class whose ``from_config`` receives ``meta``.

    Returns:
        The loaded or constructed component.
    """
    if 'load_path' in meta:
        return load_from_meta_file(meta['load_path'])
    classpath = meta.get('class_path', None)
    if not classpath:
        classpath = meta.get('classpath', None)
    assert classpath, f'{meta} doesn\'t contain classpath field'
    component_type = str_to_type(classpath)
    return component_type.from_config(meta)


================================================
FILE: hanlp/utils/file_read_backwards/__init__.py
================================================
# -*- coding: utf-8 -*-

from .file_read_backwards import FileReadBackwards  # noqa: F401

__author__ = """Robin Robin"""
__email__ = 'robinsquare42@gmail.com'
__version__ = '2.0.0'


================================================
FILE: hanlp/utils/file_read_backwards/buffer_work_space.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""BufferWorkSpace module."""

import os

# Line terminators recognized when looking for line boundaries.
new_lines = ["\r\n", "\n", "\r"]
# The same terminators as bytes, used to search the raw bytes buffer.
new_lines_bytes = [n.encode("ascii") for n in new_lines]  # we only support encodings that are backward compatible with ASCII


class BufferWorkSpace:

    """Buffer of not-yet-returned bytes used by FileReadBackwards to read a file from its end."""

    def __init__(self, fp, chunk_size):
        """Set up an empty buffer positioned at the end of ``fp``.

        Conventions:
            ``read_buffer`` is ``None`` when nothing is pending; otherwise it holds the bytes of
            the file from ``read_position`` onwards that have not been returned yet.
            ``read_position`` is the file offset up to which data has been read; it starts at
            the file size, i.e. just past the end of the file.

        Args:
            fp: An open binary file object.
            chunk_size (int): Desired number of bytes per read.
        """
        self.fp = fp
        # Nothing has been read yet, so the position starts at the end of the file.
        self.read_position = _get_file_size(self.fp)
        self.read_buffer = None
        self.chunk_size = chunk_size

    def add_to_buffer(self, content, read_position):
        """Prepend freshly read bytes to the buffer and record where they came from.

        Args:
          content(bytes): Data read from the file, located right before the current buffer.
          read_position(int): File offset at which ``content`` starts.

        Returns:

        """
        self.read_position = read_position
        pending = self.read_buffer
        self.read_buffer = content if pending is None else content + pending

    def yieldable(self):
        """Return True when a complete line can be taken out of the buffer."""
        pending = self.read_buffer
        if pending is None:
            return False

        # A newline left after dropping the trailing one means a whole line sits at the end.
        if _find_furthest_new_line(_remove_trailing_new_line(pending)) >= 0:
            return True

        # Otherwise the buffer is a line only if the beginning of the file has been reached.
        return self.read_position == 0

    def return_line(self):
        """Remove the last line from the buffer and return it without its trailing newline.

        Args:

        Returns:
          bytes: The last line held in the buffer.
          Precondition: self.yieldable() must be True

        """
        assert(self.yieldable())

        stripped = _remove_trailing_new_line(self.read_buffer)
        newline_at = _find_furthest_new_line(stripped)

        if newline_at < 0:
            # The whole file has been read and this is its first line.
            self.read_buffer = None
            return stripped

        cut = newline_at + 1
        # Keep everything up to and including the newline for later calls.
        self.read_buffer = stripped[:cut]
        return stripped[cut:]

    def read_until_yieldable(self):
        """Keep reading earlier chunks of the file until a line can be returned."""
        while not self.yieldable():
            chunk, position = _get_next_chunk(self.fp, self.read_position, self.chunk_size)
            self.add_to_buffer(chunk, position)

    def has_returned_every_line(self):
        """Return True when the start of the file was reached and the buffer is empty."""
        return self.read_position == 0 and self.read_buffer is None


def _get_file_size(fp):
    return os.fstat(fp.fileno()).st_size


def _get_next_chunk(fp, previously_read_position, chunk_size):
    """Read the chunk of the file that lies right before ``previously_read_position``.

    Args:
      fp: file
      previously_read_position: file offset from which data has already been read
      chunk_size: desired read chunk_size

    Returns:
      (bytestring, int): the data read, and the file offset at which that data starts

    """
    start, length = _get_what_to_read_next(fp, previously_read_position, chunk_size)
    fp.seek(start)
    return fp.read(length), start


def _get_what_to_read_next(fp, previously_read_position, chunk_size):
    """Work out where the next backwards read starts and how many bytes it covers.

    Args:
      fp: file used to peek at the byte on the candidate boundary
      previously_read_position: int, file offset from which data has already been read
      chunk_size: int, desired read size

    Returns:
      (int, int): The next seek position, how many bytes to read next

    """
    start = max(previously_read_position - chunk_size, 0)
    length = chunk_size

    # With "\r\n", "\n" and "\r" as separators, starting a read on the "\n" of a "\r\n" pair
    # would make the next iteration treat the "\r" as a separate newline. So move the start
    # back while the byte on it may be the tail of a multi-byte separator. A helper is used
    # rather than comparing with b"\n" so that the separator set can be generalized later.
    while start > 0:
        fp.seek(start)
        if not _is_partially_read_new_line(fp.read(1)):
            break
        start -= 1
        length += 1  # read one byte more to compensate for the earlier start

    # Near the beginning of the file fewer bytes than requested are left.
    return start, min(previously_read_position - start, length)


def _remove_trailing_new_line(l):
    """Strip at most one newline separator from the end of ``l``.

    Args:
      l: bytestring

    Returns:
      bytestring: ``l`` without its final separator, or ``l`` itself if it does not end with one

    """
    # Try the longest separators first so that "\r\n" is removed as a whole instead of just "\n".
    longest_first = sorted(new_lines_bytes, key=len, reverse=True)
    terminator = next((nl for nl in longest_first if l.endswith(nl)), None)
    if terminator is None:
        return l
    return l[:len(l) - len(terminator)]


def _find_furthest_new_line(read_buffer):
    """Locate the rightmost newline separator in ``read_buffer``.

    Args:
      read_buffer: bytestring

    Returns:
      int: The largest index at which any separator starts, or -1 if there is none

    """
    return max(read_buffer.rfind(separator) for separator in new_lines_bytes)


def _is_partially_read_new_line(b):
    """Tell whether ``b`` occurs inside some newline separator at index 1 or later.

    Args:
      b: bytestring

    Returns:
      bool: True if ``b`` is a non-leading part of a separator, False otherwise

    """
    return any(separator.find(b) >= 1 for separator in new_lines_bytes)


================================================
FILE: hanlp/utils/file_read_backwards/file_read_backwards.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""FileReadBackwards module."""

import io
import os

from .buffer_work_space import BufferWorkSpace

# Encodings accepted by the FileReadBackwards constructor (compared against the lower-cased name).
supported_encodings = ["utf-8", "ascii", "latin-1"]  # any encodings that are backward compatible with ascii should work


class FileReadBackwards:

    """Read a text file line by line, starting from its last line.

    An instance opens the file in binary mode and owns a `FileReadBackwardsIterator` that
    keeps the file handle. It can be used as a context manager, in which case the handle is
    closed on exit; `close()` can also be called explicitly at any time.
    """

    def __init__(self, path, encoding="utf-8", chunk_size=io.DEFAULT_BUFFER_SIZE):
        """Open ``path`` for backwards reading.

        Args:
            path: Path to the file to be read
            encoding (str): Encoding; matched case-insensitively against ``supported_encodings``
            chunk_size (int): How many bytes to read at a time

        Raises:
            NotImplementedError: If the encoding is not one of the supported ones.
        """
        normalized = encoding.lower()
        if normalized not in supported_encodings:
            raise NotImplementedError(
                "{0} encoding was not supported/tested.".format(encoding)
                + "Supported encodings are '{0}'".format(",".join(supported_encodings)))

        self.path = path
        self.encoding = normalized
        self.chunk_size = chunk_size
        binary_file = io.open(path, mode="rb")
        self.iterator = FileReadBackwardsIterator(binary_file, normalized, chunk_size)

    def __iter__(self):
        """Return the underlying iterator."""
        return self.iterator

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the file handle; exceptions raised in the ``with`` body are propagated."""
        self.close()
        return False

    def close(self):
        """Close the file handle held by the iterator."""
        self.iterator.close()

    def readline(self):
        """Return the next line (read backwards) followed by ``os.linesep``, or ``""`` when exhausted."""
        try:
            line = next(self.iterator)
        except StopIteration:
            return ""
        return line + os.linesep


class FileReadBackwardsIterator:
    """Iterator over the lines of a file, from the last line to the first.

    It holds the opened binary file handle and closes it once every line has been returned.
    """
    def __init__(self, fp, encoding, chunk_size):
        """Constructor for FileReadBackwardsIterator

        Args:
            fp (File): An opened binary file to read backwards from
            encoding (str): Encoding used to decode each line
            chunk_size (int): How many bytes to read at a time
        """
        self.path = fp.name
        self.encoding = encoding
        self.chunk_size = chunk_size
        self.__fp = fp
        self.__buf = BufferWorkSpace(fp, chunk_size)

    def __iter__(self):
        return self

    def next(self):
        """Return the next line, moving from the end of the file towards its beginning.

        Raises ``StopIteration`` when::

            * the beginning of the file was already reached on a previous call
            * the file has been closed

        When the lines run out, the file handle is closed before ``StopIteration`` is raised.

        Returns:
            str: The decoded line without its trailing newline.

        """
        # The file is read in binary mode because encodings such as "utf-8" use a variable
        # number of bytes per code point; seeking in text mode would require knowing each
        # encoding's character boundaries before issuing a read.
        if self.closed:
            raise StopIteration
        buffer = self.__buf
        if buffer.has_returned_every_line():
            self.close()
            raise StopIteration
        buffer.read_until_yieldable()
        raw_line = buffer.return_line()
        return raw_line.decode(self.encoding)

    __next__ = next

    @property
    def closed(self):
        """Whether the file handle has been closed.

        Returns:
            bool: The ``closed`` attribute of the underlying file handle.

        """
        return self.__fp.closed

    def close(self):
        """Closes the file handler."""
        self.__fp.close()


================================================
FILE: hanlp/utils/init_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-27 13:25
import math

import torch
from torch import nn
import functools


def embedding_uniform(tensor:torch.Tensor, seed=233):
    """Fill ``tensor`` in place with values drawn uniformly from ``[-bound, bound)``.

    ``bound`` is ``sqrt(3 / fan_out)`` where ``fan_out`` is the size of the last dimension.
    A dedicated generator seeded with ``seed`` is used, so the same seed and shape always
    yield the same values.

    Args:
        tensor: The tensor to initialize.
        seed: Seed of the random generator.

    Returns:
        The same ``tensor`` after in-place initialization.
    """
    limit = math.sqrt(3.0 / tensor.size(-1))
    rng = torch.Generator().manual_seed(seed)
    with torch.no_grad():
        return tensor.uniform_(-limit, limit, generator=rng)


================================================
FILE: hanlp/utils/io_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-26 15:02
import contextlib
import glob
import gzip
import json
import logging
import os
import platform
import random
import shlex
import shutil
import sys
import tarfile
import tempfile
import urllib
import zipfile
from contextlib import contextmanager
from pathlib import Path
from subprocess import Popen, PIPE
from typing import Tuple, Optional, Union, List
from urllib.parse import urlparse
from urllib.request import urlretrieve

from hanlp_downloader import Downloader
from hanlp_downloader.log import DownloadCallback
from packaging.version import Version

import hanlp
from hanlp_common.constant import HANLP_URL, HANLP_VERBOSE
from hanlp.utils.log_util import logger, cprint, remove_color_tag
from hanlp.utils.string_util import split_long_sentence_into
from hanlp.utils.time_util import now_filename, CountdownTimer
from hanlp.version import __version__
from hanlp_common.io import eprint


def load_jsonl(path, verbose=False):
    """Lazily yield the JSON value stored on each line of a JSON Lines file.

    Args:
        path: Path to the file.
        verbose: If truthy, lines are read through ``TimingFileIterator`` instead of a plain
            file object.

    Yields:
        The object decoded by ``json.loads`` from each line.
    """
    if verbose:
        for line in TimingFileIterator(path):
            yield json.loads(line)
        return
    # The ``with`` block closes the file even if a line fails to parse or the consumer
    # stops iterating before the end of the file.
    with open(path, encoding='utf-8') as src:
        for line in src:
            yield json.loads(line)


def make_debug_corpus(path, delimiter=None, percentage=0.1, max_samples=100):
    """Write a small ``.debug`` copy of a corpus file, or of every file in a directory.

    For each input file ``name.ext`` a file ``name.debug.ext`` is written next to it,
    containing the first samples of the input.

    Args:
        path: A file, or a directory whose regular files are processed (files whose name
            contains ``.debug`` or starts with ``.`` are skipped).
        delimiter: Separator between samples. If falsy it is chosen per file: a blank line
            for ``.tsv``, ``.conll``, ``.conllx`` and ``.conllu`` files, a newline otherwise.
        percentage: Fraction of the samples of each file to keep.
        max_samples: Upper bound on the number of samples kept per file.

    Raises:
        FileNotFoundError: If ``path`` is neither a file nor a directory.
    """
    if os.path.isfile(path):
        files = [path]
    elif os.path.isdir(path):
        files = [os.path.join(path, f) for f in os.listdir(path) if
                 os.path.isfile(os.path.join(path, f)) and '.debug' not in f and not f.startswith('.')]
    else:
        raise FileNotFoundError(path)
    blank_line_separated = {'.tsv', '.conll', '.conllx', '.conllu'}
    for filepath in files:
        filename, file_extension = os.path.splitext(filepath)
        # Both values are decided per file. Overwriting the arguments here would leak the first
        # file's delimiter and sample limit into every following file.
        file_delimiter = delimiter
        if not file_delimiter:
            file_delimiter = '\n\n' if file_extension in blank_line_separated else '\n'
        with open(filepath, encoding='utf-8') as src:
            samples = src.read().strip().split(file_delimiter)
        num_samples = min(max_samples, int(len(samples) * percentage))
        with open(filename + '.debug' + file_extension, 'w', encoding='utf-8') as out:
            out.write(file_delimiter.join(samples[:num_samples]))


def path_join(path, *paths):
    """Join ``path`` and any further components with ``os.path.join``."""
    segments = (path,) + paths
    return os.path.join(*segments)


def makedirs(path):
    """Create directory ``path`` with any missing parents (an existing one is fine) and return ``path``."""
    os.makedirs(name=path, exist_ok=True)
    return path


def tempdir(name=None):
    """Return the system temporary directory, or a sub-directory of it.

    Args:
        name: Optional sub-directory name. When given, the sub-directory is created if needed.

    Returns:
        The path of the temporary directory (or of the sub-directory).
    """
    base = tempfile.gettempdir()
    if not name:
        return base
    return makedirs(path_join(base, name))


def tempdir_human():
    """Return a temporary sub-directory named after the value of ``now_filename()``."""
    folder_name = now_filename()
    return tempdir(folder_name)


def temp_lock(path):
    """Create a ``FileLock`` whose lock file lives in the temporary directory.

    The lock file is a hidden file named after the MD5 hex digest of ``path``, so the same
    ``path`` always maps to the same lock file.

    Args:
        path: The string identifying what is being locked.

    Returns:
        A ``filelock.FileLock`` instance (not yet acquired by this function).
    """
    import hashlib
    from filelock import FileLock
    digest = hashlib.md5(path.encode('utf8')).hexdigest()
    return FileLock(f"{tempdir()}/.{digest}.lock")


def hanlp_home_default():
    """Default data directory depending on the platform and environment variables.

    Returns:
        ``hanlp`` under ``%APPDATA%`` on Windows, ``.hanlp`` under the user's home otherwise.
    """
    if not windows():
        return os.path.join(os.path.expanduser("~"), '.hanlp')
    return os.path.join(os.environ.get('APPDATA'), 'hanlp')


def windows():
    """Return ``True`` when ``platform.system()`` reports ``'Windows'``."""
    return platform.system() == 'Windows'


def hanlp_home():
    """ Home directory for HanLP resources.

    Returns:
        The value of the ``HANLP_HOME`` environment variable if it is set, otherwise the
        platform default returned by ``hanlp_home_default()``. This is the directory used
        for storage, for example when downloading models.

    The directory can be customized with the following shell command or the equivalent
    environment variable on Windows systems.

    .. highlight:: bash
    .. code-block:: bash

        $ export HANLP_HOME=/data/hanlp

    """
    fallback = hanlp_home_default()
    return os.getenv('HANLP_HOME', fallback)


def file_exist(filename) -> bool:
    """Return ``True`` if ``filename`` is an existing regular file."""
    is_regular_file = os.path.isfile(filename)
    return is_regular_file


def remove_file(filename):
    """Delete ``filename`` if it is an existing regular file; otherwise do nothing."""
    if not file_exist(filename):
        return
    os.remove(filename)


def parent_dir(path):
    """Return the normalized path of the directory one level above ``path``."""
    one_level_up = os.path.join(path, os.pardir)
    return os.path.normpath(one_level_up)


def download(url, save_path=None, save_dir=hanlp_home(), prefix=HANLP_URL, append_location=True, verbose=HANLP_VERBOSE):
    """Download ``url`` to a local file unless that file already exists.

    The data is first written to ``<save_path>.downloading`` and renamed to ``save_path`` once
    the download has finished, so an interrupted download never leaves a file at ``save_path``.

    Note that the default of ``save_dir`` is computed once, when this function is defined.

    Args:
        url: The URL to download.
        save_path: Destination file. If falsy, it is derived with
            ``path_from_url(url, save_dir, prefix, append_location)``.
        save_dir: Passed to ``path_from_url`` when ``save_path`` is not given.
        prefix: Passed to ``path_from_url`` when ``save_path`` is not given.
        append_location: Passed to ``path_from_url`` when ``save_path`` is not given.
        verbose: ``True`` to print progress and error hints to stderr.

    Returns:
        The path of the local file.

    Raises:
        BaseException: Whatever the downloader raised, re-raised after the temporary file has been
            removed and troubleshooting hints have been appended to its message where possible.
    """
    if not save_path:
        save_path = path_from_url(url, save_dir, prefix, append_location)
    if os.path.isfile(save_path):
        # Already downloaded: reuse the local copy.
        if verbose:
            eprint('Using local {}, ignore {}'.format(save_path, url))
        return save_path
    else:
        makedirs(parent_dir(save_path))
        if verbose:
            eprint('Downloading {} to {}'.format(url, save_path))
        tmp_path = '{}.downloading'.format(save_path)
        # Discard leftovers of a previous, unfinished download.
        remove_file(tmp_path)
        try:
            # NOTE(review): the positional ``4`` is presumably the number of download threads — confirm
            # against the hanlp_downloader API.
            downloader = Downloader(url, tmp_path, 4, headers={
                'User-agent': f'HanLP/{__version__} ({platform.platform()})'})
            if verbose:
                downloader.subscribe(DownloadCallback(show_header=False))
            downloader.start_sync()
        except BaseException as e:
            # BaseException is caught so that the partial file is removed on Ctrl-C as well;
            # the exception is always re-raised at the end of this handler.
            remove_file(tmp_path)
            # Drop the fragment so that the URL shown in the hints is directly downloadable.
            url = url.split('#')[0]
            try:
                installed_version, latest_version = check_outdated()
            except:
                installed_version, latest_version = None, None  # No Internet
            if installed_version != latest_version:
                # Always prompt user to upgrade whenever a new version is available
                hints = f'[green]Please upgrade to the latest version ({latest_version}) with:[/green]' \
                        f'\n\n\t[yellow]pip install -U hanlp[/yellow]\n'
            else:  # Otherwise, prompt user to re-try
                hints = f'[green]Please re-try or download it to {save_path} by yourself '
                if not windows():
                    hints += f'with:[/green]\n\n\t[yellow]wget {url} -O {save_path}[/yellow]\n\n'
                else:
                    hints += 'using some decent downloading tools.[/green]\n'
                if not url.startswith(HANLP_URL):
                    hints += 'For third party data, unrestricted connectivity to the global network may be required.'
                else:
                    hints += 'See also https://hanlp.hankcs.com/docs/install.html#install-models for instructions.'
            message = f'Download failed due to [red]{repr(e)}[/red].\n' \
                      f'{hints}'
            if verbose:
                cprint(message)
            # Attach the hints (without color tags) to the exception itself so that callers
            # that only see the exception still get them.
            if hasattr(e, 'msg'):
                e.msg += '\n' + remove_color_tag(message)
            elif hasattr(e, 'args') and e.args and isinstance(e.args, tuple) and isinstance(e.args[0], str):
                e.args = (e.args[0] + '\n' + remove_color_tag(message),) + e.args[1:]
            raise e from None
        # The download succeeded: move the temporary file to its final location.
        remove_file(save_path)
        os.rename(tmp_path, save_path)
    return save_path


def parse_url_path(url):
    """Split a URL into its network location and its path.

    Args:
        url: The URL to parse.

    Returns:
        A tuple ``(netloc, path)`` where ``path`` has its leading and trailing slashes removed.
    """
    components = urlparse(url)
    return components.netloc, components.path.strip('/')


def uncompress(path, dest=None, remove=True, verbose=HANLP_VERBOSE):
    """Uncompress a file and clean up uncompressed files once an error is triggered.

    Args:
      path: The path to a compressed file
      dest: The dest folder. When omitted, archives are extracted next to ``path`` (or into a folder named after
        the archive when it holds several top-level entries) and a bare ``.gz`` is inflated next to ``path``.
      remove: Remove archive file after decompression.
      verbose: ``True`` to print log message.

    Returns:
        Destination path. For a bare ``.gz`` this is the inflated file; for an archive extracted without an
        explicit ``dest`` it is the extracted folder, or the single extracted file; with an explicit ``dest``
        and an archive it is ``dest`` itself.

    Raises:
        Exception: Whatever the underlying decompression raised; the archive and partial outputs are removed first.

    """
    prefix, ext = split_if_compressed(path)
    folder_name = os.path.basename(prefix)
    file_is_zip = ext == '.zip'
    root_of_folder = None
    # Only filled in when the layout has to be inferred (no explicit ``dest``). It used to be unbound when
    # ``dest`` was given, which made the single-file check below raise NameError after a successful extraction.
    namelist = []
    if ext == '.gz':
        # A bare .gz wraps exactly one file: inflate it to ``prefix`` or, if requested, into ``dest``.
        target = path_join(dest, folder_name) if dest else prefix
        try:
            if dest:
                os.makedirs(dest, exist_ok=True)
            with gzip.open(path, 'rb') as f_in, open(target, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        except Exception:
            remove_file(target)
            remove_file(path)
            raise
        # Report where the data actually went; previously ``dest`` stayed ``None`` on the default call.
        dest = target
    else:
        try:
            with zipfile.ZipFile(path, "r") if ext == '.zip' else tarfile.open(path, 'r:*') as archive:
                if not dest:
                    namelist = sorted(archive.namelist() if file_is_zip else archive.getnames())
                    if namelist[0] == '.':
                        # tarballs created with ``tar -C dir .`` list a leading '.' and './'-prefixed members
                        namelist = namelist[1:]
                        namelist = [p[len('./'):] if p.startswith('./') else p for p in namelist]
                    if ext == '.tgz':
                        roots = set(x.split('/')[0] for x in namelist)
                        if len(roots) == 1:
                            root_of_folder = next(iter(roots))
                    else:
                        # only one file, root_of_folder = ''
                        root_of_folder = namelist[0].strip('/') if len(namelist) > 1 else ''
                    if all(f.split('/')[0] == root_of_folder for f in namelist[1:]) or not root_of_folder:
                        dest = os.path.dirname(path)  # only one folder, unzip to the same dir
                    else:
                        root_of_folder = None
                        dest = prefix  # assume zip contains more than one file or folder
                if verbose:
                    eprint('Decompressing {} to {}'.format(path, dest))
                # NOTE(review): extractall() trusts member names; archives from untrusted sources could write
                # outside ``dest``. Consider validating members if third-party archives are a concern.
                archive.extractall(dest)
                if root_of_folder:
                    if root_of_folder != folder_name:
                        # move root to match folder name
                        os.rename(path_join(dest, root_of_folder), path_join(dest, folder_name))
                    dest = path_join(dest, folder_name)
                elif len(namelist) == 1:
                    # single-file archive extracted to an inferred location: return the file itself
                    dest = path_join(dest, namelist[0])
        except Exception:
            remove_file(path)
            if os.path.exists(prefix):
                if os.path.isfile(prefix):
                    os.remove(prefix)
                elif os.path.isdir(prefix):
                    shutil.rmtree(prefix)
            raise
    if remove:
        remove_file(path)
    return dest


def split_if_compressed(path: str, compressed_ext=('.zip', '.tgz', '.gz', 'bz2', '.xz')) -> Tuple[str, Optional[str]]:
    """Separate a recognised compression suffix from a path.

    Args:
        path: The path to inspect.
        compressed_ext: Extensions treated as compressed. ``.tar.gz`` is always recognised in addition to these.

    Returns:
        ``(path_without_extension, extension)`` when the extension is recognised, otherwise ``(path, None)``.
    """
    # NOTE(review): the 'bz2' entry has no leading dot, so it can never equal an os.path.splitext() suffix.
    tar_gz = '.tar.gz'
    if not path.endswith(tar_gz):
        stem, suffix = os.path.splitext(path)
        if suffix != tar_gz and suffix not in compressed_ext:
            return path, None
        return stem, suffix
    # splitext() would only peel off '.gz', so the double suffix is handled by hand
    return path[:-len(tar_gz)], tar_gz


def get_resource(path: str, save_dir=hanlp_home(), extract=True, prefix=HANLP_URL, append_location=True,
                 verbose=HANLP_VERBOSE):
    """Fetch real (local) path for a resource (model, corpus, whatever) to ``save_dir``.

    ``path`` is first looked up in ``hanlp.pretrained.ALL``; when it is a key there, the stored value replaces it.
    An ``http:``/``https:`` URL may carry a ``#anchor`` naming a child inside the extracted archive; an anchor
    that starts with ``/`` additionally renames the local folder to the anchor's first component.

    Args:
      path: A local path (which will be returned as is) or a remote URL (which will be downloaded, decompressed then
        returned).
      save_dir: Where to store the resource (Default value = :meth:`hanlp.utils.io_util.hanlp_home`)
      extract: Whether to unzip it if it's a zip file (Default value = True)
      prefix: A prefix when matched with an URL (path), then that URL is considered to be official. For official
        resources, they will not go to a folder called ``thirdparty`` under :const:`~hanlp_common.constants.HANLP_HOME`.
      append_location: Whether to put unofficial files in a ``thirdparty`` folder.
      verbose: Whether to print log messages.

    Returns:
      The real path to the resource.

    """
    _path = path  # keep the caller's original argument for the recursive retry below
    path = hanlp.pretrained.ALL.get(path, path)
    anchor: str = None
    compressed = None
    if os.path.isdir(path):
        return path
    elif os.path.isfile(path):
        pass
    elif path.startswith('http:') or path.startswith('https:'):
        url = path
        if '#' in url:
            url, anchor = url.split('#', maxsplit=1)
        realpath = path_from_url(path, save_dir, prefix, append_location)
        # realpath loses its archive extension here; ``compressed`` holds that extension or None
        realpath, compressed = split_if_compressed(realpath)
        # check if resource is there
        if anchor:
            if anchor.startswith('/'):
                # indicates the folder name has to be polished
                anchor = anchor.lstrip('/')
                parts = anchor.split('/')
                renamed_realpath = str(Path(realpath).parent.joinpath(parts[0]))
                # NOTE(review): ``compressed`` is None for non-archive URLs, in which case this concatenation
                # would raise TypeError - confirm '/'-anchors are only used with archive URLs.
                if os.path.isfile(realpath + compressed):
                    os.rename(realpath + compressed, renamed_realpath + compressed)
                realpath = renamed_realpath
                anchor = '/'.join(parts[1:])
            child = path_join(realpath, anchor)
            if os.path.exists(child):
                return child
        elif os.path.isdir(realpath) or (os.path.isfile(realpath) and (compressed and extract)):
            return realpath
        else:
            if compressed:
                # look for a previously extracted sibling such as realpath + '.<something>'
                pattern = realpath + '.*'
                files = glob.glob(pattern)
                files = list(filter(lambda x: not x.endswith('.downloading') and not x.endswith(compressed), files))
                if files:
                    if len(files) > 1:
                        logger.debug(f'Found multiple files with {pattern}, will use the first one.')
                    return files[0]
        # realpath is where its path after extraction
        if compressed:
            realpath += compressed
        with temp_lock(path):
            if not os.path.isfile(realpath):
                path = download(url=path, save_path=realpath, verbose=verbose)
            else:
                path = realpath
    if extract and compressed:
        with temp_lock(path):
            if os.path.isfile(path):
                path = uncompress(path, verbose=verbose)
            else:  # other process must have already decompressed it and deleted it
                return get_resource(_path, save_dir, extract, prefix, append_location, verbose)
        if anchor:
            path = path_join(path, anchor)

    return path


def path_from_url(url, save_dir=hanlp_home(), prefix=HANLP_URL, append_location=True):
    """Work out where a remote URL is stored locally.

    Args:
        url: Remote URL.
        save_dir: The root folder to save this file; a falsy value falls back to ``hanlp_home()``.
        prefix: The prefix of official website. Any URLs starting with this prefix will be considered official.
        append_location: Whether to put unofficial files in a ``thirdparty`` folder.

    Returns:
        The real path that this URL is mapped to.
    """
    root = save_dir or hanlp_home()
    domain, relative_path = parse_url_path(url)
    if not append_location:
        # flat layout: only the file name survives
        return os.path.join(root, os.path.basename(relative_path))
    if url.startswith(prefix):
        # official URL: drop the part of the path that already belongs to the prefix
        middle = prefix.split(domain)[-1].lstrip('/')
        if relative_path.startswith(middle):
            relative_path = relative_path[len(middle):]
    else:
        # unofficial URL: namespace it by domain below 'thirdparty'
        root = os.path.join(root, 'thirdparty', domain)
    return os.path.join(root, relative_path)


def human_bytes(file_size: int) -> str:
    """Render a byte count as a short ``KB``/``MB``/``GB`` string.

    Args:
        file_size: Size in bytes.

    Returns:
        Whole kilobytes up to and including 1024 KB, otherwise megabytes or gigabytes with one decimal.
    """
    size = file_size / 1024  # KB
    if size > 1024:
        size = size / 1024  # MB
        if size > 1024:
            return '%.1f GB' % (size / 1024)
        return '%.1f MB' % size
    return '%d KB' % size


def read_cells(filepath: str, delimiter='auto', strip=True, skip_header=False):
    """Iterate over the rows of a delimited text file, yielding each row as a list of cells.

    Args:
        filepath: Path or URL of the file; resolved through :func:`get_resource`.
        delimiter: Cell separator. ``'auto'`` picks a tab for ``.tsv``, a comma for ``.csv`` and otherwise
            ``None`` (split on any whitespace).
        strip: Whether to strip surrounding whitespace from every cell.
        skip_header: Whether to discard the first line of the file.

    Yields:
        The list of cells of every non-blank line.
    """
    filepath = get_resource(filepath)
    if delimiter == 'auto':
        if filepath.endswith('.tsv'):
            delimiter = '\t'
        elif filepath.endswith('.csv'):
            delimiter = ','
        else:
            delimiter = None
    with open(filepath, encoding='utf-8') as src:
        if skip_header:
            # A default avoids StopIteration escaping this generator (a RuntimeError) on an empty file
            next(src, None)
        for line in src:
            line = line.strip()
            if not line:
                continue
            cells = line.split(delimiter)
            if strip:
                cells = [c.strip() for c in cells]
            # Yield regardless of ``strip``; previously rows were only produced when strip=True
            yield cells


def replace_ext(filepath, ext) -> str:
    """Swap the last extension of ``filepath`` for ``ext``.

    Args:
        filepath: The path whose extension is replaced.
        ext: The new extension, appended verbatim (include the dot if one is wanted).

    Returns:
        The path with its final extension replaced.
    """
    return os.path.splitext(filepath)[0] + ext


def read_tsv_as_sents(tsv_file_path, ignore_prefix=None, delimiter=None):
    """Read a column-formatted file and group consecutive rows into sentences.

    Args:
        tsv_file_path: Path or URL of the file; resolved through :func:`get_resource`.
        ignore_prefix: Lines starting with this prefix are skipped entirely.
        delimiter: Cell separator handed to ``str.split``; ``None`` splits on any whitespace.

    Yields:
        One sentence at a time as a list of rows, each row being a list of cells. Blank lines end a sentence.
    """
    resolved_path = get_resource(tsv_file_path)
    pending = []
    with open(resolved_path, encoding='utf-8') as reader:
        for raw in reader:
            if ignore_prefix and raw.startswith(ignore_prefix):
                continue
            stripped = raw.strip()
            row = stripped.split(delimiter)
            if stripped and row:
                pending.append(row)
                continue
            # a blank line closes the sentence collected so far, if any
            if pending:
                yield pending
                pending = []
    # the file may end without a trailing blank line
    if pending:
        yield pending


def generate_words_tags_from_tsv(tsv_file_path, lower=False, gold=True, max_seq_length=None, sent_delimiter=None,
                                 char_level=False, hard_constraint=False):
    """Yield ``(words, tags)`` pairs from a two-column file read by :func:`read_tsv_as_sents`.

    Args:
        tsv_file_path: Path or URL of the file.
        lower: Whether to lowercase the words.
        gold: Whether to read tags from the second column; when ``False`` the tags are ``None``.
        max_seq_length: When truthy, every sentence is passed through ``split_long_sentence_into`` and each
            resulting piece is yielded separately.
        sent_delimiter: Forwarded to ``split_long_sentence_into``.
        char_level: Forwarded to ``split_long_sentence_into``.
        hard_constraint: Forwarded to ``split_long_sentence_into``.

    Yields:
        A tuple of the word list and the tag list (or ``None`` when ``gold`` is ``False``).

    Raises:
        ValueError: If ``gold`` is set, ``max_seq_length`` is not, and a row lacks the tag column.
    """
    for sent in read_tsv_as_sents(tsv_file_path):
        words = [cells[0] for cells in sent]
        if max_seq_length:
            offset = 0
            # try to split the sequence to make it fit into max_seq_length
            for shorter_words in split_long_sentence_into(words, max_seq_length, sent_delimiter, char_level,
                                                          hard_constraint):
                if gold:
                    # tags are sliced in step with the words already emitted
                    shorter_tags = [cells[1] for cells in sent[offset:offset + len(shorter_words)]]
                    offset += len(shorter_words)
                else:
                    shorter_tags = None
                if lower:
                    shorter_words = [word.lower() for word in shorter_words]
                yield shorter_words, shorter_tags
        else:
            if gold:
                try:
                    tags = [cells[1] for cells in sent]
                except IndexError as e:
                    # Only a missing column is a data error; a bare ``except`` would also rewrite
                    # KeyboardInterrupt and friends into ValueError.
                    raise ValueError(f'Failed to load {tsv_file_path}: {sent}') from e
            else:
                tags = None
            if lower:
                words = [word.lower() for word in words]
            yield words, tags


def split_file(filepath, train=0.8, dev=0.1, test=0.1, names=None, shuffle=False):
    """Split a file into train/dev/test portions written next to it.

    Files ending in ``.tsv`` are split sentence by sentence (via :func:`read_tsv_as_sents`); any other file is
    split line by line.

    Args:
        filepath: The file to split.
        train: Relative share of the training portion; a falsy share drops that portion.
        dev: Relative share of the development portion.
        test: Relative share of the test portion.
        names: Optional mapping from portion name (``'train'``/``'dev'``/``'test'``) to an output path. Portions
            without an entry are written to ``<name>.<portion><ext>``.
        shuffle: Whether to assign samples to portions in random order rather than by position.

    Returns:
        The list of output paths, in train/dev/test order (dropped portions omitted).
    """
    is_tsv = filepath.endswith('.tsv')
    # First pass: count samples so that each sample index can be turned into a ratio in [0, 1)
    if is_tsv:
        num_samples = sum(1 for _ in read_tsv_as_sents(filepath))
    else:
        with open(filepath, encoding='utf-8') as counted:
            num_samples = sum(1 for _ in counted)
    splits = {'train': train, 'dev': dev, 'test': test}
    splits = {k: v for k, v in splits.items() if v}
    total = sum(splits.values())
    # Half-open [lower, upper) interval of normalized ratios owned by each portion
    bounds = []
    accumulated = 0
    for v in splits.values():
        lower = accumulated
        accumulated += v / total
        bounds.append((lower, accumulated))
    if names is None:
        names = {}
    name, ext = os.path.splitext(filepath)
    filenames = [names.get(split, name + '.' + split + ext) for split in splits]
    order = None
    if shuffle:
        order = list(range(num_samples))
        random.shuffle(order)
    outs = []
    src = None
    try:
        for f in filenames:
            outs.append(open(f, 'w', encoding='utf-8'))
        src = read_tsv_as_sents(filepath) if is_tsv else open(filepath, encoding='utf-8')
        for idx, sample in enumerate(src):
            if order:
                idx = order[idx]
            ratio = idx / num_samples
            for (lower, upper), out in zip(bounds, outs):
                if lower <= ratio < upper:
                    if isinstance(sample, list):
                        # a sentence from read_tsv_as_sents: serialize rows and end with a blank line
                        sample = '\n'.join('\t'.join(x) for x in sample) + '\n\n'
                    out.write(sample)
                    break
    finally:
        # Release every handle even when reading or writing fails half-way
        if src is not None:
            src.close()
        for out in outs:
            out.close()
    return filenames


def fileno(file_or_fd):
    """Resolve a file object or a raw descriptor to an integer file descriptor.

    Args:
        file_or_fd: An object exposing ``.fileno()`` or something that already is a descriptor.

    Returns:
        The descriptor, or ``None`` when obtaining it raised.

    Raises:
        ValueError: If the resolved value is not an ``int``.
    """
    try:
        if hasattr(file_or_fd, 'fileno'):
            descriptor = file_or_fd.fileno()
        else:
            descriptor = file_or_fd
    except:
        # best effort: some streams expose fileno() but refuse to answer
        return None
    if isinstance(descriptor, int):
        return descriptor
    raise ValueError("Expected a file (`.fileno()`) or a file descriptor")


@contextmanager
def stdout_redirected(to=os.devnull, stdout=None):
    """Redirect stdout to else where.
    Copied from https://stackoverflow.com/questions/4675728/redirect-stdout-to-a-file-in-python/22434262#22434262

    The redirection happens at the file-descriptor level through ``os.dup2``. The context yields ``None``
    without redirecting anything on Windows or when no descriptor could be obtained for ``stdout``.

    Args:
      to:  Target device. Either something :func:`fileno` can resolve, or a file name to open for writing.
      stdout:  Source device. Defaults to ``sys.stdout``.

    """
    if windows():  # This doesn't play well with windows
        yield None
        return
    if stdout is None:
        stdout = sys.stdout
    stdout_fd = fileno(stdout)
    # NOTE(review): this truthiness test also treats descriptor 0 as "no descriptor" - confirm that is intended.
    if not stdout_fd:
        yield None
        return
    # copy stdout_fd before it is overwritten
    # NOTE: `copied` is inheritable on Windows when duplicating a standard stream
    with os.fdopen(os.dup(stdout_fd), 'wb') as copied:
        stdout.flush()  # flush library buffers that dup2 knows nothing about
        try:
            os.dup2(fileno(to), stdout_fd)  # $ exec >&to
        except ValueError:  # filename
            # fileno() raised because ``to`` is not a descriptor, so treat it as a path
            with open(to, 'wb') as to_file:
                os.dup2(to_file.fileno(), stdout_fd)  # $ exec > to
        try:
            yield stdout  # allow code to be run with the redirected stdout
        finally:
            # restore stdout to its previous value
            # NOTE: dup2 makes stdout_fd inheritable unconditionally
            try:
                stdout.flush()
                os.dup2(copied.fileno(), stdout_fd)  # $ exec >&copied
            except:
                # This is the best we can do
                pass

def get_exitcode_stdout_stderr(cmd):
    """Run an external command to completion and collect its result.
    See https://stackoverflow.com/a/21000308/3730690

    Args:
      cmd: The command line as a single string; it is tokenized with ``shlex.split``.

    Returns:
        A tuple of the exit code, the captured stdout and the captured stderr, both decoded as UTF-8.
    """
    process = Popen(shlex.split(cmd), stdout=PIPE, stderr=PIPE)
    raw_out, raw_err = process.communicate()
    return process.returncode, raw_out.decode('utf-8'), raw_err.decode('utf-8')


def run_cmd(cmd: str) -> str:
    """Run a command and hand back its stdout.

    Args:
        cmd: The command line to execute.

    Returns:
        The decoded stdout of the command.

    Raises:
        RuntimeError: If the command exits with a non-zero code; the message carries stderr and the command.
    """
    exitcode, out, err = get_exitcode_stdout_stderr(cmd)
    if not exitcode:
        return out
    raise RuntimeError(err + '\nThe command is:\n' + cmd)


@contextlib.contextmanager
def pushd(new_dir):
    """Temporarily change the working directory.

    Args:
        new_dir: The directory to switch to for the duration of the ``with`` block.

    The directory that was current on entry is restored on exit, even if the block raises.
    """
    origin = os.getcwd()
    os.chdir(new_dir)
    try:
        yield
    finally:
        os.chdir(origin)


def basename_no_ext(path):
    """Return the final path component with its last extension removed.

    Args:
        path: A file path.

    Returns:
        The base name without its final extension.
    """
    return os.path.splitext(os.path.basename(path))[0]


def file_cache(path: str, purge=False):
    """Locate the ``.cache`` companion of a file and tell whether it is still fresh.

    Args:
        path: The source file; it must exist.
        purge: When true the cache is reported as invalid regardless of timestamps.

    Returns:
        A tuple ``(cache_name, cache_valid)`` where ``cache_valid`` is true only if the cache file exists and
        was modified strictly later than ``path``.
    """
    cache_name = path + '.cache'
    if os.path.isfile(cache_name) and not purge:
        cache_time = os.path.getmtime(cache_name)
    else:
        cache_time = 0
    return cache_name, cache_time > os.path.getmtime(path)


def merge_files(files: List[str], dst: str):
    """Concatenate several files into one, byte for byte.

    Args:
        files: Paths of the files to append, in order.
        dst: Path of the output file; it is created or overwritten.
    """
    with open(dst, 'wb') as sink:
        for part in files:
            with open(part, 'rb') as source:
                shutil.copyfileobj(source, sink)


class TimingFileIterator(CountdownTimer):
    """Iterate over the lines of a text file while tracking progress by byte offset.

    The timer total is the file size in bytes and ``current`` is updated to the read position after each line.
    NOTE(review): the meaning of ``current``/``total`` is inherited from ``CountdownTimer``, which is defined
    elsewhere - confirm against that class.
    """

    def __init__(self, filepath) -> None:
        """
        Args:
            filepath: Path of the file to iterate; its size is read immediately.
        """
        super().__init__(os.path.getsize(filepath))
        self.filepath = filepath

    def __iter__(self):
        """Yield the file line by line (UTF-8, undecodable bytes ignored).

        Raises:
            FileNotFoundError: If ``filepath`` is not a regular file when iteration starts.
        """
        if not os.path.isfile(self.filepath):
            raise FileNotFoundError(self.filepath)
        # ``with`` closes the file even when the consumer stops early or raises inside the loop
        with open(self.filepath, encoding='utf-8', errors='ignore') as fp:
            # readline() rather than file iteration keeps fp.tell() usable between lines
            for line in iter(fp.readline, ''):
                yield line
                self.current = fp.tell()

    def log(self, info=None, ratio_percentage=True, ratio=True, step=0, interval=0.5, erase=True,
            logger: Union[logging.Logger, bool] = None, newline=False, ratio_width=None):
        """Forward to the parent ``log``; progress is driven by ``__iter__``, so ``step`` must stay 0."""
        assert step == 0
        super().log(info, ratio_percentage, ratio, step, interval, erase, logger, newline, ratio_width)

    @property
    def ratio(self) -> str:
        """Progress rendered as ``<read>/<total>`` in human-readable sizes."""
        return f'{human_bytes(self.current)}/{human_bytes(self.total)}'

    @property
    def ratio_width(self) -> int:
        """Width reserved for :attr:`ratio`: two size strings plus the separating slash."""
        return len(f'{human_bytes(self.total)}') * 2 + 1

    def close(self):
        """No-op; the file is opened and closed inside ``__iter__``."""
        pass


def check_outdated(package='hanlp', version=__version__, repository_url='https://pypi.python.org/pypi/%s/json'):
    """Look up the newest release of a package next to the installed version.

    ``repository_url`` is a ``%``-style format string that receives the package name, which allows pointing at
    another index such as test.pypi.org or a private repository.
    Adopted from https://github.com/alexmojaki/outdated/blob/master/outdated/__init__.py

    Args:
        package: Package name.
        version: Installed version string.
        repository_url: URL on pypi.

    Returns:
        A 2-tuple ``(installed_version, latest_version)`` of parsed versions.
    """
    return Version(version), get_latest_info_from_pypi(package, repository_url)


def get_latest_info_from_pypi(package='hanlp', repository_url='https://pypi.python.org/pypi/%s/json'):
    """Query a PyPI-style JSON endpoint for the latest released version of a package.

    Args:
        package: Package name, substituted into ``repository_url``.
        repository_url: ``%``-style format string of the JSON endpoint.

    Returns:
        The parsed ``info.version`` field of the response.

    Raises:
        Whatever ``urllib.request.urlopen`` or JSON decoding raises, e.g. when there is no connectivity.
    """
    url = repository_url % package
    # The response object is a context manager; closing it releases the connection deterministically
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    return Version(json.loads(payload)['info']['version'])


def check_version_conflicts(extras=None):
    """Check whether the installed distributions satisfy the requirements declared by ``hanlp``.

    Args:
        extras: The extras whose requirements are included; a falsy value selects every extra that the
            installed ``hanlp`` distribution declares.

    Returns:
        A tuple ``(error, extras)``. ``error`` is ``None`` when resolution succeeds, otherwise a description
        of the version conflict or of the missing distribution. ``extras`` is the set that was checked.
    """
    from pkg_resources import get_distribution, Requirement, WorkingSet, VersionConflict, DistributionNotFound
    distribution = get_distribution('hanlp')
    extras = extras or distribution.extras
    if isinstance(extras, list):
        extras = tuple(extras)
    requirements = distribution.requires(extras=extras)
    try:
        WorkingSet().resolve(requirements, extras=extras)
    except VersionConflict as conflict:
        return conflict.with_context('hanlp').report(), extras
    except DistributionNotFound as missing:
        return str(missing), extras
    return None, extras


================================================
FILE: hanlp/utils/lang/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-09 18:46

__doc__ = '''
This package holds misc utils for specific languages.
'''


================================================
FILE: hanlp/utils/lang/en/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:28


================================================
FILE: hanlp/utils/lang/en/english_tokenizer.py
================================================
#!/usr/bin/env python
"""
Regex-based word tokenizers.

Note that small/full/half-width character variants are *not* covered.
If a text were to contain such characters, normalize it first.
A modified version of https://github.com/fnl/segtok

- dropped dependency on regex
- dropped web_tokenize
- supported concat word

"""

__author__ = 'Florian Leitner <florian.leitner@gmail.com>'
from re import compile, UNICODE, VERBOSE

SENTENCE_TERMINALS = '.!?\u203C\u203D\u2047\u2048\u2049\u3002' \
                     '\uFE52\uFE57\uFF01\uFF0E\uFF1F\uFF61'
"The list of valid Unicode sentence terminal characters."

# Note that the Unicode category Pd is NOT a good set for valid word-breaking hyphens,
# because it contains many dashes that should not be considered part of a word.
HYPHENS = '\u00AD\u058A\u05BE\u0F0C\u1400\u1806\u2010-\u2012\u2e17\u30A0-'
"Any valid word-breaking hyphen, including ASCII hyphen minus."

APOSTROPHES = '\'\u00B4\u02B9\u02BC\u2019\u2032'
"""All apostrophe-like marks, including the ASCII "single quote"."""

APOSTROPHE = r"[\u00B4\u02B9\u02BC\u2019\u2032]"
"""Any apostrophe-like marks, including "prime" but not the ASCII "single quote"."""

LINEBREAK = r'(?:\r\n|\n|\r|\u2028)'
"""Any valid linebreak sequence (Windows, Unix, Mac, or U+2028)."""

LETTER = r'[^\W\d_]'
"""Any Unicode word character that is neither a decimal digit nor the underscore."""

NUMBER = r'\d'
"""Any Unicode decimal digit (category Nd)."""

POWER = r'\u207B?[\u00B9\u00B2\u00B3]'
"""Superscript 1, 2, and 3, optionally prefixed with a minus sign."""

SUBDIGIT = r'[\u2080-\u2089]'
"""Subscript digits."""

# LETTER is a *negated* class, so splicing NUMBER into it (as ``LETTER[:-1] + NUMBER + ']'`` did) adds the
# digits to the excluded set and yields a pattern equivalent to LETTER. Word characters minus the underscore
# is exactly "LETTER or NUMBER".
ALNUM = r'[^\W_]'
"""Any alphanumeric Unicode character: letter or number."""

HYPHEN = r'[%s]' % HYPHENS

SPACE = r'\s'
"""Any unicode space character plus the (horizontal) tab."""

APO_MATCHER = compile(APOSTROPHE, UNICODE)
"""Matcher for any apostrophe."""

HYPHENATED_LINEBREAK = compile(
    r'({alnum}{hyphen}){space}*?{linebreak}{space}*?({alnum})'.format(
        alnum=ALNUM, hyphen=HYPHEN, linebreak=LINEBREAK, space=SPACE
    ), UNICODE
)
"""
The pattern matches any alphanumeric Unicode character, followed by a hyphen,
a single line-break surrounded by optional (non-breaking) spaces,
and terminates with a alphanumeric character on this next line.
The opening char and hyphen as well as the terminating char are captured in two groups.
"""

# ``"['" + APOSTROPHE[1:]`` re-opens the APOSTROPHE class with the ASCII single quote added to it
IS_POSSESSIVE = compile(r"{alnum}+(?:{hyphen}{alnum}+)*(?:{apo}[sS]|[sS]{apo})$".format(
    alnum=ALNUM, hyphen=HYPHEN, apo="['" + APOSTROPHE[1:]
), UNICODE
)
"""A pattern that matches English words with a possessive s terminal form."""

IS_CONTRACTION = compile(r"{alnum}+(?:{hyphen}{alnum}+)*{apo}(?:d|ll|m|re|s|t|ve)$".format(
    alnum=ALNUM, hyphen=HYPHEN, apo="['" + APOSTROPHE[1:]
), UNICODE
)
"""A pattern that matches tokens with valid English contractions ``'(d|ll|m|re|s|t|ve)``."""

# Lower-cased run-together words mapped to the end offsets of their consecutive pieces, e.g. 'cannot' with
# [3, 6] is cut into token[0:3] and token[3:6]. Entries containing an apostrophe are listed twice: once with
# the ASCII single quote and once with U+2019.
MAP_CONCAT_WORD = {'aint': [2, 4], 'arent': [3, 5], 'cant': [2, 4], 'cannot': [3, 6], 'coulda': [5, 6],
                   'couldnt': [5, 7], 'didnt': [3, 5], 'doncha': [2, 3, 6], 'dont': [2, 4],
                   'doesnt': [4, 6], 'dunno': [2, 3, 5], 'finna': [3, 5], 'gimme': [3, 5], 'gonna': [3, 5],
                   'gotta': [3, 5], 'hadnt': [3, 5], 'hasnt': [3, 5], 'havent': [4, 6], 'isnt': [2, 4],
                   'itd': [2, 3], 'itll': [2, 4], 'lemme': [3, 5], 'lets': [3, 4], 'mightnt': [5, 7],
                   'mustnt': [4, 6], 'shant': [3, 5], 'shoulda': [6, 7], 'shouldnt': [6, 8],
                   'thatd': [4, 5], 'thatll': [4, 6], 'thats': [4, 5], 'theyd': [4, 5], 'theyre': [4, 6],
                   'theyve': [4, 6], 'wanna': [3, 5], 'wasnt': [3, 5], 'weve': [2, 4], 'werent': [4, 6],
                   'whadya': [3, 4, 6], 'whatcha': [4, 7], 'whatre': [4, 6], 'whats': [4, 5],
                   'whatve': [4, 6], 'whatz': [4, 5], 'whod': [3, 4], 'wholl': [3, 5], 'woncha': [2, 3, 6],
                   'wont': [2, 4], 'woulda': [5, 6], 'wouldnt': [5, 7], 'youd': [3, 4], 'youll': [3, 5],
                   'youve': [3, 5], "'tis": [2, 4], "'twas": [2, 5], "d'ye": [2, 4], "don'cha": [2, 4, 7],
                   "i'mma": [1, 3, 5], "i'mmm": [1, 5], "more'n": [4, 6], '’tis': [2, 4], '’twas': [2, 5],
                   'd’ye': [2, 4], 'don’cha': [2, 4, 7], 'i’mma': [1, 3, 5], 'i’mmm': [1, 5],
                   'more’n': [4, 6]}

# Case-insensitive search for a clitic after an ASCII letter: either "n't" or an apostrophe followed by
# ll/nt/re/ve/d/m/s/t/z, using ' or U+2019 as the apostrophe. Group 1 spans the clitic itself; the match
# must be followed by a non-word character or the end of the string.
RE_APOSTROPHE = compile(r'(?i)[a-z](n[\'\u2019]t|[\'\u2019](ll|nt|re|ve|[dmstz]))(\W|$)')


def split_possessive_markers(tokens):
    """
    Detach possessive markers from the end of alphanumeric (and hyphenated) tokens.

    Feed it the output of any of the tokenizer functions; the list is updated in place, for example::

    >>> my_sentence = "This is Fred's latest book."
    >>> split_possessive_markers(tokenize_english(my_sentence))
    ['This', 'is', 'Fred', "'s", 'latest', 'book', '.']

    :param tokens: a list of tokens
    :returns: the same list object, with every possessive token replaced by its two parts
    """
    cursor = 0  # position of the current token inside the (growing) list

    for token in list(tokens):
        if IS_POSSESSIVE.match(token) is not None:
            if token[-1].lower() == 's' and token[-2] in APOSTROPHES:
                marker_length = 2  # "...'s"
            elif token[-2].lower() == 's' and token[-1] in APOSTROPHES:
                marker_length = 1  # "...s'"
            else:
                marker_length = 0

            if marker_length:
                tokens[cursor:cursor + 1] = [token[:-marker_length], token[-marker_length:]]
                cursor += 1

        cursor += 1

    return tokens


def split_contractions(tokens):
    """
    A function to split apostrophe contractions at the end of alphanumeric (and hyphenated) tokens.

    Takes the output of any of the tagger functions and produces and updated list.
    The list is modified in place: a token matching ``IS_CONTRACTION`` is cut at its apostrophe, except that
    a trailing ``n't`` (lower-case ``n`` and ``t``) is split off together with its ``n``.

    :param tokens: a list of tokens
    :returns: an updated list if a split was made or the original list otherwise
    """
    # index of the current token inside ``tokens``; runs ahead of the snapshot as pieces are inserted
    idx = -1

    # iterate over a snapshot because ``tokens`` grows while splitting
    for token in list(tokens):
        idx += 1

        if IS_CONTRACTION.match(token) is not None:
            length = len(token)

            if length > 1:
                # scan from the end for the apostrophe
                for pos in range(length - 1, -1, -1):
                    if token[pos] in APOSTROPHES:
                        # "...n't": move the cut one character left so the "n" goes with "'t"
                        if 2 < length and pos + 2 == length and token[-1] == 't' and token[pos - 1] == 'n':
                            pos -= 1

                        tokens.insert(idx, token[:pos])
                        idx += 1
                        tokens[idx] = token[pos:]

    return tokens


def _matches(regex):
    """Regular expression compiling function decorator."""

    def match_decorator(fn):
        automaton = compile(regex, UNICODE | VERBOSE)
        fn.split = automaton.split
        fn.match = automaton.match
        return fn

    return match_decorator


@_matches(r'\s+')
def space_tokenizer(sentence):
    """
    Tokenize the input `sentence` on whitespace.

    The text is cut at every run of Unicode spaces ``\\s+`` (i.e., any kind of **Unicode** space character).
    The separators themselves, and any empty strings they leave behind, are dropped from the result.
    """
    return list(filter(None, space_tokenizer.split(sentence)))


@_matches(r'(%s+)' % ALNUM)
def symbol_tokenizer(sentence):
    """
    The symbol tokenizer refines the :func:`space_tokenizer` output by isolating ``ALNUM`` runs.

    Every space-separated span is cut around its ``ALNUM`` character sequences; the runs and the text
    between them all become tokens, and empty pieces are discarded.
    """
    pieces = []
    for span in space_tokenizer(sentence):
        pieces.extend(part for part in symbol_tokenizer.split(span) if part)
    return pieces


@_matches(r"""((?:
    # Dots, except ellipsis
    {alnum} \. (?!\.\.)
    | # Comma, surrounded by digits (e.g., chemicals) or letters
    {alnum} , (?={alnum})
    | # Colon, surrounded by digits (e.g., time, references)
    {number} : (?={number})
    | # Hyphen, surrounded by digits (e.g., DNA endings: "5'-ACGT-3'") or letters
    {alnum} {apo}? {hyphen} (?={alnum})  # incl. optional apostrophe for DNA segments
    | # Apostophes, non-consecutive
    {apo} (?!{apo})
    | # ASCII single quote, surrounded by digits or letters (no dangling allowed)
    {alnum} ' (?={alnum})
    | # ASCII single quote after an s and at the token's end
    s ' $
    | # Terminal dimensions (superscript minus, 1, 2, and 3) attached to physical units
    #  size-prefix                 unit-acronym    dimension
    \b [yzafpn\u00B5mcdhkMGTPEZY]? {letter}{{1,3}} {power} $
    | # Atom counts (subscript numbers) and ionization states (optional superscript
    #   2 or 3 followed by a + or -) are attached to valid fragments of a chemical formula
    \b (?:[A-Z][a-z]?|[\)\]])+ {subdigit}+ (?:[\u00B2\u00B3]?[\u207A\u207B])?
    | # Any (Unicode) letter, digit, or the underscore
    {alnum}
    )+)""".format(alnum=ALNUM, apo=APOSTROPHE, power=POWER, subdigit=SUBDIGIT,
                  hyphen=HYPHEN, letter=LETTER, number=NUMBER))
def tokenize_english(sentence):
    """
    A modified version of the segtok tagger: https://github.com/fnl/segtok
    This tagger extends the alphanumeric :func:`symbol_tokenizer` by splitting fewer cases:

    1. Dots appearing after a letter are maintained as part of the word, except for the last word
       in a sentence if that dot is the sentence terminal. Therefore, abbreviation marks (words
       containing or ending in a ``.``, like "i.e.") remain intact and URL or ID segments remain
       complete ("www.ex-ample.com", "EC1.2.3.4.5", etc.). The only dots that never are attached
       are triple dots (``...``; ellipsis).
    2. Commas surrounded by alphanumeric characters are maintained in the word, too, e.g. ``a,b``.
       Colons surrounded by digits are maintained, e.g., 'at 12:30pm' or 'Isaiah 12:3'.
       Commas, semi-colons, and colons dangling at the end of a token are always spliced off.
    3. Any two alphanumeric letters that are separated by a single hyphen are joined together;
       Those "inner" hyphens may optionally be followed by a linebreak surrounded by spaces;
       The spaces will be removed, however. For example, ``Hel- \\r\\n \t lo`` contains a (Windows)
       linebreak and will be returned as ``Hel-lo``.
    4. Apostrophes are always allowed in words as long as they are not repeated; The single quote
       ASCII letter ``'`` is only allowed as a terminal apostrophe after the letter ``s``,
       otherwise it must be surrounded by letters. To support DNA and chemicals, an apostrophe
       (prime) may be located before the hyphen, as in the single token "5'-ACGT-3'" (if a
       non-ASCII apostrophe is used instead of the shown single quote).
    5. Superscript 1, 2, and 3, optionally prefixed with a superscript minus, are attached to a
       word if it is no longer than 3 letters (optionally 4 if the first letter is a power prefix
       in the range from yocto, y (10^-24) to yotta, Y (10^+24)).
    6. Subscript digits are attached if prefixed with letters that look like a chemical formula.

    After this splitting, run-together words listed in ``MAP_CONCAT_WORD`` and clitics found by
    ``RE_APOSTROPHE`` are cut into separate tokens as well.

    NOTE(review): the digit-related examples above rely on ``ALNUM`` matching digits - verify against
    the ``ALNUM`` definition.

    :param sentence: a string, or a list of strings to tokenize one by one
    :returns: a list of tokens for a string input, a list of token lists for a list input,
        and an empty list for any falsy input
    """
    if not sentence:
        return []
    # anything that is not a list is treated as one sentence and unwrapped again on return
    flat = not isinstance(sentence, list)
    if flat:
        sents = [sentence]
    else:
        sents = sentence
    results = []
    for sentence in sents:
        # re-join words hyphenated across a line break before splitting on spaces
        pruned = HYPHENATED_LINEBREAK.sub(r'\1\2', sentence)
        tokens = [token for span in space_tokenizer(pruned) for
                  token in tokenize_english.split(span) if token]

        # splice the sentence terminal off the last word/token if it has any at its borders
        # only look for the sentence terminal in the last three tokens
        for idx, word in enumerate(reversed(tokens[-3:]), 1):
            if (tokenize_english.match(word) and not APO_MATCHER.match(word)) or \
                    any(t in word for t in SENTENCE_TERMINALS):
                last = len(word) - 1

                if 0 == last or u'...' == word:
                    # any case of "..." or any single char (last == 0)
                    pass  # leave the token as it is
                elif any(word.rfind(t) == last for t in SENTENCE_TERMINALS):
                    # "stuff."
                    tokens[-idx] = word[:-1]
                    tokens.insert(len(tokens) - idx + 1, word[-1])
                elif any(word.find(t) == 0 for t in SENTENCE_TERMINALS):
                    # ".stuff"
                    tokens[-idx] = word[0]
                    tokens.insert(len(tokens) - idx + 1, word[1:])

                # only the first qualifying token (counted from the end) is inspected
                break

        # keep splicing off any dangling commas and (semi-) colons
        dirty = True
        while dirty:
            dirty = False

            for idx, word in enumerate(reversed(tokens), 1):
                while len(word) > 1 and word[-1] in u',;:':
                    char = word[-1]  # the dangling comma/colon
                    word = word[:-1]
                    tokens[-idx] = word
                    tokens.insert(len(tokens) - idx + 1, char)
                    idx += 1
                    dirty = True
                if dirty:
                    break  # restart check to avoid index errors

        # split concat words
        chunks = []
        for token in tokens:
            # the table is keyed by lower-case words; its values are the end offsets of the pieces
            t = MAP_CONCAT_WORD.get(token.lower(), None)
            if t:
                i = 0
                for j in t:
                    chunks.append(token[i:j])
                    i = j
            else:
                chunks.append(token)
        tokens = chunks
        # split APOSTROPHE
        chunks = []
        for token in tokens:
            m = RE_APOSTROPHE.search(token)
            if m:
                # group 1 is the clitic: emit what precedes it, the clitic, and any remainder
                chunks.append(token[:m.start(1)])
                chunks.append(token[m.start(1):m.end(1)])
                if m.end(1) < len(token):
                    chunks.append(token[m.end(1):])
            else:
                chunks.append(token)
        tokens = chunks
        results.append(tokens)
    return results[0] if flat else results

================================================
FILE: hanlp/utils/lang/ja/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-13 13:24


================================================
FILE: hanlp/utils/lang/ja/bert_tok.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-13 13:24
from typing import Union, Optional

from transformers import BertTokenizerFast, TensorType, BatchEncoding, BertJapaneseTokenizer as _BertJapaneseTokenizer
from transformers.file_utils import PaddingStrategy
from transformers.tokenization_utils_base import TextInput, PreTokenizedInput, EncodedInput, TruncationStrategy


class BertJapaneseTokenizer(_BertJapaneseTokenizer):
    """Project-local subclass of ``transformers.BertJapaneseTokenizer``.

    It currently adds no behavior of its own; it exists as a hook for future customization.
    """
    # We may need to customize character level tokenization to handle English words and URLs
    pass


class BertJapaneseTokenizerFast(BertTokenizerFast):
    """A ``BertTokenizerFast`` whose ``encode_plus`` always encodes its input element by element."""

    def encode_plus(
            self,
            text: Union[TextInput, PreTokenizedInput, EncodedInput],
            text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
            add_special_tokens: bool = True,
            padding: Union[bool, str, PaddingStrategy] = False,
            truncation: Union[bool, str, TruncationStrategy] = False,
            max_length: Optional[int] = None,
            stride: int = 0,
            is_split_into_words: bool = False,
            pad_to_multiple_of: Optional[int] = None,
            return_tensors: Optional[Union[str, TensorType]] = None,
            return_token_type_ids: Optional[bool] = None,
            return_attention_mask: Optional[bool] = None,
            return_overflowing_tokens: bool = False,
            return_special_tokens_mask: bool = False,
            return_offsets_mapping: bool = False,
            return_length: bool = False,
            verbose: bool = True,
            **kwargs
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a sequence or a pair of sequences.

        ``text`` is first converted with ``list(text)`` (a ``str`` therefore becomes a list of
        single characters) and ``is_split_into_words`` is forced to ``True`` regardless of the
        value passed by the caller. After encoding, the offsets of the first encoding are
        rewritten so that the ``i``-th offset pair is shifted by ``i``.

        .. warning::
            This method is deprecated, ``__call__`` should be used instead.

        Args:
            text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
                method).
            text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the ``tokenize`` method) or a list of integers (tokenized string ids using the
                ``convert_tokens_to_ids`` method).

        Returns:
            The ``BatchEncoding`` produced by the delegated ``encode_plus`` call, after the offset rewrite.
        """
        # Break the input into its elements; the caller-supplied ``is_split_into_words`` is overridden.
        text = list(text)
        is_split_into_words = True
        # Delegates through the unbound ``BertJapaneseTokenizer.encode_plus`` with ``self`` passed explicitly;
        # all arguments are forwarded positionally, so their order must match that method's signature.
        encoding = BertJapaneseTokenizer.encode_plus(self,
                                                     text,
                                                     text_pair,
                                                     add_special_tokens,
                                                     padding,
                                                     truncation,
                                                     max_length,
                                                     stride,
                                                     is_split_into_words,
                                                     pad_to_multiple_of,
                                                     return_tensors,
                                                     return_token_type_ids,
                                                     return_attention_mask,
                                                     return_overflowing_tokens,
                                                     return_special_tokens_mask,
                                                     return_offsets_mapping,
                                                     return_length,
                                                     verbose,
                                                     **kwargs
                                                     )
        offsets = encoding.encodings[0].offsets
        # Shift the i-th (begin, end) pair by its index i.
        # NOTE(review): presumably offsets come back relative to each one-character word, so adding the
        # position turns them into text-level offsets — confirm. Every entry is shifted, including any
        # entries that belong to special tokens.
        fixed_offsets = [(b + i, e + i) for i, (b, e) in enumerate(offsets)]
        # TODO: This doesn't work with rust tokenizers
        encoding.encodings[0].offsets.clear()
        encoding.encodings[0].offsets.extend(fixed_offsets)
        return encoding


================================================
FILE: hanlp/utils/lang/zh/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-09 18:47

================================================
FILE: hanlp/utils/lang/zh/char_table.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-09 19:07
from typing import List

from hanlp.utils.io_util import get_resource
from hanlp_common.io import load_json

HANLP_CHAR_TABLE_TXT = 'https://file.hankcs.com/corpus/char_table.zip#CharTable.txt'
HANLP_CHAR_TABLE_JSON = 'https://file.hankcs.com/corpus/char_table.json.zip'


class CharTable:
    """Character-to-character normalization table, loaded on first use.

    The mapping lives in the class attribute ``convert`` and is shared by all callers. It is
    populated from ``HANLP_CHAR_TABLE_TXT`` the first time a conversion is requested while the
    mapping is still empty.
    """
    convert = {}

    @staticmethod
    def convert_char(c):
        """Return the replacement for ``c``, or ``c`` itself when the table has no entry for it."""
        if not CharTable.convert:
            # An empty mapping means the table has not been loaded yet.
            CharTable._init()
        table = CharTable.convert
        return table.get(c, c)

    @staticmethod
    def normalize_text(text: str) -> str:
        """Convert every character of ``text`` and join the results into one string."""
        return ''.join(map(CharTable.convert_char, text))

    @staticmethod
    def normalize_chars(chars: List[str]) -> List[str]:
        """Convert every element of ``chars`` and return the results as a new list."""
        return list(map(CharTable.convert_char, chars))

    @staticmethod
    def _init():
        """Populate the shared ``convert`` mapping from ``CharTable.load()``."""
        CharTable.convert = CharTable.load()

    @staticmethod
    def load():
        """Read the plain-text table and return it as a ``dict``.

        Only lines that are exactly three characters long (after removing the trailing newline)
        are used: the first character is the key, the third is the value and the middle
        character is ignored. All other lines are skipped.
        """
        mapping = {}
        table_path = get_resource(HANLP_CHAR_TABLE_TXT)
        with open(table_path, encoding='utf-8') as table_file:
            for raw_line in table_file:
                entry = raw_line.rstrip('\n')
                if len(entry) == 3:
                    source, _, target = entry
                    mapping[source] = target
        return mapping


class JsonCharTable(CharTable):
    """``CharTable`` variant whose ``load`` reads the JSON edition of the table."""

    @staticmethod
    def load():
        """Resolve ``HANLP_CHAR_TABLE_JSON`` via ``get_resource`` and return what ``load_json`` parses from it."""
        json_path = get_resource(HANLP_CHAR_TABLE_JSON)
        return load_json(json_path)


================================================
FILE: hanlp/utils/lang/zh/localization.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-05 02:09

# Chinese display names for annotation layers, keyed by task identifier.
task = {
    'dep': '依存句法树',  # dependency parse tree
    'token': '单词',  # word
    'pos': '词性',  # part of speech
    'ner': '命名实体',  # named entity
    'srl': '语义角色'  # semantic role
}

# Chinese descriptions keyed by part-of-speech tag.
# NOTE(review): the tag inventory looks like the CTB tagset plus 'NOI' and 'URL' — confirm.
pos = {
    'VA': '表语形容词', 'VC': '系动词', 'VE': '动词有无', 'VV': '其他动词', 'NR': '专有名词', 'NT': '时间名词', 'NN': '其他名词',
    'LC': '方位词', 'PN': '代词', 'DT': '限定词', 'CD': '概数词', 'OD': '序数词', 'M': '量词', 'AD': '副词', 'P': '介词',
    'CC': '并列连接词', 'CS': '从属连词', 'DEC': '补语成分“的”', 'DEG': '属格“的”', 'DER': '表结果的“得”', 'DEV': '表方式的“地”',
    'AS': '动态助词', 'SP': '句末助词', 'ETC': '表示省略', 'MSP': '其他小品词', 'IJ': '句首感叹词', 'ON': '象声词',
    'LB': '长句式表被动', 'SB': '短句式表被动', 'BA': '把字句', 'JJ': '其他名词修饰语', 'FW': '外来语', 'PU': '标点符号',
    'NOI': '噪声', 'URL': '网址'
}

# Chinese descriptions keyed by named-entity tag:
# 'NT' -> organization/group, 'NS' -> place name, 'NR' -> person name.
ner = {
    'NT': '机构团体', 'NS': '地名', 'NR': '人名'
}

# Chinese descriptions keyed by dependency relation label.
# NOTE(review): 'elf' and 'clf' map to the same description; 'elf' may be a typo of 'clf' — confirm
# before removing either key, since callers may look up both.
dep = {
    'nn': '复合名词修饰', 'punct': '标点符号', 'nsubj': '名词性主语', 'conj': '连接性状语', 'dobj': '直接宾语', 'advmod': '名词性状语',
    'prep': '介词性修饰语', 'nummod': '数词修饰语', 'amod': '形容词修饰语', 'pobj': '介词性宾语', 'rcmod': '相关关系', 'cpm': '补语',
    'assm': '关联标记', 'assmod': '关联修饰', 'cc': '并列关系', 'elf': '类别修饰', 'ccomp': '从句补充', 'det': '限定语', 'lobj': '时间介词',
    'range': '数量词间接宾语', 'asp': '时态标记', 'tmod': '时间修饰语', 'plmod': '介词性地点修饰', 'attr': '属性', 'mmod': '情态动词',
    'loc': '位置补语', 'top': '主题', 'pccomp': '介词补语', 'etc': '省略关系', 'lccomp': '位置补语', 'ordmod': '量词修饰',
    'xsubj': '控制主语', 'neg': '否定修饰', 'rcomp': '结果补语', 'comod': '并列联合动词', 'vmod': '动词修饰', 'prtmod': '小品词',
    'ba': '把字关系', 'dvpm': '地字修饰', 'dvpmod': '地字动词短语', 'prnmod': '插入词修饰', 'cop': '系动词', 'pass': '被动标记',
    'nsubjpass': '被动名词主语', 'clf': '类别修饰', 'dep': '依赖关系', 'root': '核心关系'
}


================================================
FILE: hanlp/utils/log_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-24 22:12
import datetime
import io
import logging
import os
import sys
from logging import LogRecord

import termcolor

from hanlp_common.constant import IPYTHON


class ColoredFormatter(logging.Formatter):
    """Log formatter that post-processes ``[color]...[/color]`` tags in the formatted message.

    With ``enable=True`` the message is passed through ``color_format``; otherwise it is passed
    through ``remove_color_tag``.
    """

    def __init__(self, fmt=None, datefmt=None, style='%', enable=True):
        super().__init__(fmt, datefmt, style)
        self.enable = enable

    def formatMessage(self, record: LogRecord) -> str:
        """Format ``record`` with the base class, then render or strip the color tags."""
        formatted = super().formatMessage(record)
        post_process = color_format if self.enable else remove_color_tag
        return post_process(formatted)


def init_logger(name=None, root_dir=None, level=logging.INFO, mode='w',
                fmt="%(asctime)s %(levelname)s %(message)s",
                datefmt='%Y-%m-%d %H:%M:%S') -> logging.Logger:
    """Create or fetch a non-propagating logger with a colored console handler and an optional log file.

    Args:
        name: Logger name; when falsy, the current timestamp is used instead.
        root_dir: Optional directory. When given, the logger is registered under
            ``os.path.join(root_dir, name)`` and a ``<root_dir>/<name>.log`` file handler is attached.
        level: Level applied to the logger and to the handlers created by this call.
        mode: File open mode of the log file.
        fmt: Record format string.
        datefmt: Timestamp format string.

    Returns:
        The configured ``logging.Logger``.
    """
    if not name:
        name = datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
    qualified_name = os.path.join(root_dir, name) if root_dir else name
    log = logging.getLogger(qualified_name)
    log.propagate = False

    console = logging.StreamHandler(sys.stdout)  # stderr will be rendered as red which is bad
    console.setFormatter(ColoredFormatter(fmt, datefmt=datefmt))
    # Skip the console handler if the logger already writes to stdout or stderr.
    already_on_std = any(
        isinstance(existing, logging.StreamHandler)
        and (existing.stream == sys.stderr or existing.stream == sys.stdout)
        for existing in log.handlers
    )
    if not already_on_std:
        log.addHandler(console)
    log.setLevel(level)
    console.setLevel(level)

    if root_dir:
        os.makedirs(root_dir, exist_ok=True)
        file_handler = logging.FileHandler("{0}/{1}.log".format(root_dir, name), mode=mode)
        # Color tags are stripped rather than rendered in the file output.
        file_handler.setFormatter(ColoredFormatter(fmt, datefmt=datefmt, enable=False))
        log.addHandler(file_handler)
        file_handler.setLevel(level)

    return log


# Module-level logger named 'hanlp'; its level is read from the HANLP_LOG_LEVEL environment
# variable and falls back to 'INFO' when the variable is unset.
logger = init_logger(name='hanlp', level=os.environ.get('HANLP_LOG_LEVEL', 'INFO'))


def enable_debug(debug=True):
    """Set the module logger to ``DEBUG`` when ``debug`` is truthy, otherwise to ``ERROR``."""
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.ERROR)


class ErasablePrinter(object):
    """Writes a message to a stream and can wipe the previously written message before the next one."""

    def __init__(self, out=sys.stderr):
        self._last_print_width = 0
        self.out = out

    def erase(self):
        """Overwrite the last printed message with blanks; does nothing if no width is recorded."""
        width = self._last_print_width
        if not width:
            return
        write = self.out.write
        blanks = " " * width
        if IPYTHON:
            write("\r")
            write(blanks)
        else:
            backspaces = "\b" * width
            write(backspaces)
            write(blanks)
            write(backspaces)
        write("\r")  # \r is essential when multi-lines were printed
        self._last_print_width = 0

    def print(self, msg: str, color=True):
        """Erase the previous message, then write ``msg`` and remember its width.

        Args:
            msg: Text to write; with ``color`` enabled, color tags are rendered by ``color_format_len``.
            color: Whether to render color tags in ``msg``.
        """
        self.erase()
        if not color:
            width = len(msg)
        else:
            msg, width = color_format_len(msg)
            if IPYTHON:
                # Under IPython the recorded width is the length of the rendered message itself.
                width = len(msg)
        self._last_print_width = width
        self.out.write(msg)
        self.out.flush()


# Shared printer instance used by ``flash``; it is created with the default output stream.
_printer = ErasablePrinter()


def flash(line: str, color=True):
    """Print ``line`` through the shared erasable printer, replacing the previously flashed line."""
    _printer.print(msg=line, color=color)


def color_format(msg: str):
    """Replace every ``[name]`` tag with its escape sequence and every ``[/name]`` tag with the reset code.

    ``name`` ranges over the keys of termcolor's ``COLORS``, ``HIGHLIGHTS`` and ``ATTRIBUTES`` tables.
    """
    for table in (termcolor.COLORS, termcolor.HIGHLIGHTS, termcolor.ATTRIBUTES):
        for name, code in table.items():
            msg = msg.replace(f'[{name}]', '\033[%dm' % code)
            msg = msg.replace(f'[/{name}]', termcolor.RESET)
    return msg


def remove_color_tag(msg: str):
    """Strip every ``[name]`` and ``[/name]`` tag whose name is a termcolor color, highlight or attribute."""
    for table in (termcolor.COLORS, termcolor.HIGHLIGHTS, termcolor.ATTRIBUTES):
        for name in table:
            msg = msg.replace(f'[{name}]', '')
            msg = msg.replace(f'[/{name}]', '')
    return msg


def color_format_len(msg: str):
    """Render color tags like ``color_format`` and also report the message length without the tags.

    Returns:
        A ``(rendered, length)`` tuple where ``length`` is the original length of ``msg`` minus
        the total length of all tags that were replaced.
    """
    visible_len = len(msg)
    for table in (termcolor.COLORS, termcolor.HIGHLIGHTS, termcolor.ATTRIBUTES):
        for name, code in table.items():
            msg, removed = _replace_color_offset(msg, f'[{name}]', '\033[%dm' % code)
            visible_len -= removed
            msg, removed = _replace_color_offset(msg, f'[/{name}]', termcolor.RESET)
            visible_len -= removed
    return msg, visible_len


def _replace_color_offset(msg: str, color: str, ctrl: str):
    chunks = msg.split(color)
    delta = (len(chunks) - 1) * len(color)
    return ctrl.join(chunks), delta


def cprint(*args, file=None, **kwargs):
    """``print`` replacement that renders color tags in the output.

    The arguments are first printed into an in-memory buffer with the caller's ``kwargs``; the
    buffered text is then color-formatted and written to ``file`` without an extra line ending.
    """
    with io.StringIO() as buffer:
        print(*args, file=buffer, **kwargs)
        plain_text = buffer.getvalue()
    print(color_format(plain_text), end='', file=file)


def main():
    """Script entry point: prints the available colors via ``show_colors``."""
    # cprint('[blink][yellow]...[/yellow][/blink]')
    # show_colors_and_formats()
    show_colors()
    # print('previous', end='')
    # for i in range(10):
    #     flash(f'[red]{i}[/red]')


def show_colors_and_formats():
    """Log one sample for every combination of termcolor color, highlight and attribute."""
    samples = (
        f'[{color}][{highlight}][{attribute}] {color}+{highlight}+{attribute} [/{attribute}][/{highlight}][/{color}]'
        for color in termcolor.COLORS
        for highlight in termcolor.HIGHLIGHTS
        for attribute in termcolor.ATTRIBUTES
    )
    logger.info(''.join(samples))


def show_colors():
    """Print the name of every termcolor color, each rendered in its own color."""
    for color_name in termcolor.COLORS:
        cprint(f'[{color_name}]"{color_name}",[/{color_name}]')


# Generates tables for Doxygen flavored Markdown.  See the Doxygen
# documentation for details:
#   http://www.doxygen.nl/manual/markdown.html#md_tables

# Translation dictionaries for table alignment


if __name__ == '__main__':
    main()


================================================
FILE: hanlp/utils/rules.py
================================================
import re

# Placeholder substituted for the whitespace that follows an abbreviation, so that the period of
# the abbreviation is not followed by whitespace while sentences are being matched.
_SEPARATOR = r'@'
# A sentence is either a run ending in '.', '!' or '?' that is followed by whitespace or the end
# of the text, or a run that extends up to a newline or the end of the text.
_RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
# An uppercase letter, one or two lowercase letters and a period (e.g. "Mr."), then one
# whitespace character and a word character.
_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
# A period, a single letter and a period (e.g. the tail of "U.S."), then one whitespace
# character and a word character.
_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
# Counterparts of the two patterns above that match the separator instead of the whitespace;
# used to restore the original spacing.
_UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + _SEPARATOR + r'(\w)', re.UNICODE)
_UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + _SEPARATOR + r'(\w)', re.UNICODE)


def _replace_with_separator(text, separator, regexs):
    replacement = r"\1" + separator + r"\2"
    result = text
    for regex in regexs:
        result = regex.sub(replacement, result)
    return result


def split_sentence(text, best=True):
    """Lazily split ``text`` into sentences.

    Line breaks are first inserted after Chinese sentence-final punctuation, six-dot and
    double-ellipsis sequences, taking closing quotes into account. Each resulting non-empty line
    is then yielded as-is (``best=False``) or split further on ``.``/``!``/``?`` while protecting
    the abbreviations matched by ``_AB_SENIOR`` and ``_AB_ACRONYM`` (``best=True``).

    Args:
        text: The text to split.
        best: Whether to apply the finer, abbreviation-aware splitting to every line.

    Yields:
        One stripped sentence at a time.
    """
    break_rules = (
        (r'([。！？?])([^”’])', r"\1\n\2"),
        (r'(\.{6})([^”’])', r"\1\n\2"),
        (r'(…{2})([^”’])', r"\1\n\2"),
        (r'([。！？?][”’])([^，。！？?])', r'\1\n\2'),
    )
    for pattern, replacement in break_rules:
        text = re.sub(pattern, replacement, text)

    for piece in text.split("\n"):
        piece = piece.strip()
        if not piece:
            continue
        matches = []
        if best:
            # Hide the space after an abbreviation so its period is not taken as a sentence end.
            masked = _replace_with_separator(piece, _SEPARATOR, [_AB_SENIOR, _AB_ACRONYM])
            matches = list(_RE_SENTENCE.finditer(masked))
        if not matches:
            yield piece
            continue
        for match in matches:
            # Put the hidden spaces back before handing the sentence out.
            yield _replace_with_separator(match.group(), r" ", [_UNDO_AB_SENIOR, _UNDO_AB_ACRONYM])


================================================
FILE: hanlp/utils/span_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-12 20:34
import warnings
from typing import Dict, List, Tuple, Callable, Set, Optional


def generate_words_per_line(file_path):
    """Yield the whitespace-separated tokens of every non-blank line of a UTF-8 text file.

    Args:
        file_path: Path of the file to read.

    Yields:
        A list of tokens for each line that contains at least one token.
    """
    with open(file_path, encoding='utf-8') as lines:
        for raw_line in lines:
            tokens = raw_line.split()
            if tokens:
                yield tokens


def words_to_bmes(words):
    """Convert a list of words into a flat list of per-character BMES tags.

    A one-character word becomes ``S``; a longer word becomes ``B``, zero or more ``M`` and ``E``.

    Raises:
        ValueError: If any word is ``None`` or empty.
    """
    labels = []
    for word in words:
        if not word:
            raise ValueError('{} contains None or zero-length word {}'.format(str(words), word))
        size = len(word)
        if size == 1:
            labels.append('S')
            continue
        labels.append('B')
        labels.extend('M' * (size - 2))
        labels.append('E')
    return labels


def words_to_bi(words):
    """Convert a list of words into a flat list of per-character BI tags.

    The first character of each word is tagged ``B`` and every following character ``I``.

    Raises:
        ValueError: If any word is ``None`` or empty.
    """
    labels = []
    for word in words:
        if not word:
            raise ValueError('{} contains None or zero-length word {}'.format(str(words), word))
        labels.append('B')
        labels.extend('I' * (len(word) - 1))
    return labels


def bmes_to_words(chars, tags):
    """Group characters into words according to their BMES tags.

    A new word starts at every character tagged ``B`` or ``S``; the tag of the first character
    is not inspected.

    Args:
        chars: The characters of the sentence.
        tags: One tag per character.

    Returns:
        The list of words.
    """
    words = []
    if len(chars) == 0:
        return words

    current = chars[0]
    for char, tag in zip(chars[1:], tags[1:]):
        if tag in ('B', 'S'):
            words.append(current)
            current = ''
        current += char
    if current:
        words.append(current)

    return words


def bmes_to_spans(tags):
    """Turn a BMES tag sequence into ``(start, end)`` spans with an exclusive end.

    A new span starts at every position tagged ``B`` or ``S``; the tag at position 0 is not
    inspected. An empty sequence yields no spans.
    """
    spans = []
    begin = 0
    for position, tag in enumerate(tags[1:], start=1):
        if tag in ('B', 'S'):
            spans.append((begin, position))
            begin = position
    # Close the span that is still open when the sequence ends.
    if len(tags) > 0:
        spans.append((begin, len(tags)))

    return spans


def bmes_of(sentence, segmented):
    """Return the characters of ``sentence`` together with their BMES tags.

    Args:
        sentence: The sentence; when ``segmented`` is truthy its words are separated by whitespace.
        segmented: Whether ``sentence`` is already segmented. If not, every character
            (whitespace included) is tagged ``S``.

    Returns:
        A ``(chars, tags)`` tuple of two equally long lists.
    """
    if not segmented:
        chars = list(sentence)
        return chars, ['S'] * len(chars)

    chars, tags = [], []
    for word in sentence.split():
        chars.extend(word)
        size = len(word)
        if size == 1:
            tags.append('S')
        else:
            tags.append('B')
            tags.extend('M' * (size - 2))
            tags.append('E')
    return chars, tags


def iobes_to_bilou(src, dst):
    """Rewrite a tab-separated IOBES tagging file using BILOU tag prefixes.

    Each non-blank line of ``src`` must consist of a word and a tag separated by a single tab.
    ``E-`` prefixes become ``L-`` and ``S-`` prefixes become ``U-``; all other tags are copied
    unchanged. Blank lines (sentence separators) are preserved as empty lines.

    Both files are handled as UTF-8, independent of the platform's default encoding.

    Args:
        src: Path of the IOBES input file.
        dst: Path of the BILOU output file; it is overwritten if it exists.

    Raises:
        ValueError: If a non-blank line does not contain exactly one tab.
    """
    # Explicit UTF-8 keeps the conversion correct on systems whose locale encoding differs;
    # separate handle names avoid shadowing the ``src`` path argument.
    with open(src, encoding='utf-8') as reader, open(dst, 'w', encoding='utf-8') as writer:
        for line in reader:
            line = line.strip()
            if not line:
                writer.write('\n')
                continue
            word, tag = line.split('\t')
            if tag.startswith('E-'):
                tag = 'L-' + tag[2:]
            elif tag.startswith('S-'):
                tag = 'U-' + tag[2:]
            writer.write(f'{word}\t{tag}\n')


def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tuple[int, int]]:
    """
    List every label-to-label transition that the given tagging scheme permits, including
    transitions out of a synthetic START state and into a synthetic END state.

    # Parameters

    constraint_type : `str`, required
        Name of the tagging scheme, as understood by `is_transition_allowed`.
    labels : `Dict[int, str]`, required
        A mapping {label_id -> label}.

    # Returns

    `List[Tuple[int, int]]`
        The allowed transitions (from_label_id, to_label_id). START is given the id
        `len(labels)` and END the id `len(labels) + 1`.
    """
    boundary_names = ("START", "END")
    label_count = len(labels)
    candidates = list(labels.items()) + [(label_count, "START"), (label_count + 1, "END")]

    def split(label):
        # Boundary states have no entity part; other labels are split after their first character.
        if label in boundary_names:
            return label, ""
        return label[0], label[1:]

    transitions = []
    for source_id, source_label in candidates:
        source_tag, source_entity = split(source_label)
        for target_id, target_label in candidates:
            target_tag, target_entity = split(target_label)
            if is_transition_allowed(constraint_type, source_tag, source_entity, target_tag, target_entity):
                transitions.append((source_id, target_id))
    return transitions


def is_transition_allowed(
    constraint_type: str, from_tag: str, from_entity: str, to_tag: str, to_entity: str
):
    """
    Decide whether moving from one tag to the next is legal under a tagging scheme.

    # Parameters

    constraint_type : `str`, required
        The tagging scheme: one of "BIOUL", "BIO", "IOB1" or "BMES".
    from_tag : `str`, required
        Tag the transition starts from, e.g. `I` for the label `I-PER`, or "START".
    from_entity : `str`, required
        Entity part of the origin label.
    to_tag : `str`, required
        Tag the transition leads to, e.g. `I` for the label `I-PER`, or "END".
    to_entity : `str`, required
        Entity part of the destination label.

    # Returns

    `bool`
        Whether the transition is allowed under the given `constraint_type`.

    # Raises

    `ValueError`
        If `constraint_type` is not one of the supported schemes. Transitions into START or
        out of END are rejected before the scheme is checked.
    """
    # Nothing may enter START or leave END, whatever the scheme.
    if to_tag == "START" or from_tag == "END":
        return False

    same_entity = from_entity == to_entity

    if constraint_type == "BIOUL":
        if from_tag == "START":
            return to_tag in ("O", "B", "U")
        if to_tag == "END":
            return from_tag in ("O", "L", "U")
        # Outside a span (after O, L-x or U-x) the next tag is O, B-* or U-*.
        if from_tag in ("O", "L", "U") and to_tag in ("O", "B", "U"):
            return True
        # Inside a span (after B-x or I-x) the next tag is I-x or L-x of the same entity.
        return from_tag in ("B", "I") and to_tag in ("I", "L") and same_entity

    if constraint_type == "BIO":
        if from_tag == "START":
            return to_tag in ("O", "B")
        if to_tag == "END":
            return from_tag in ("O", "B", "I")
        # O and B-x may follow anything.
        if to_tag in ("O", "B"):
            return True
        # I-x may only follow B-x or I-x of the same entity.
        return to_tag == "I" and from_tag in ("B", "I") and same_entity

    if constraint_type == "IOB1":
        if from_tag == "START":
            return to_tag in ("O", "I")
        if to_tag == "END":
            return from_tag in ("O", "B", "I")
        # O and I-x may follow anything.
        if to_tag in ("O", "I"):
            return True
        # B-x may only follow B-x or I-x of the same entity.
        return to_tag == "B" and from_tag in ("B", "I") and same_entity

    if constraint_type == "BMES":
        if from_tag == "START":
            return to_tag in ("B", "S")
        if to_tag == "END":
            return from_tag in ("E", "S")
        # B and S may only follow E or S.
        if to_tag in ("B", "S"):
            return from_tag in ("E", "S")
        # M-x and E-x may only follow B-x or M-x of the same entity.
        if to_tag in ("M", "E"):
            return from_tag in ("B", "M") and same_entity
        return False

    raise ValueError(f"Unknown constraint type: {constraint_type}")


# A labelled span whose label is an integer: (label, (first, second)).
TypedSpan = Tuple[int, Tuple[int, int]]
# A labelled span whose label is a string: (label, (first, second)).
TypedStringSpan = Tuple[str, Tuple[int, int]]


class InvalidTagSequence(Exception):
    """Raised when a tag sequence cannot be decoded under the expected tagging scheme.

    Args:
        tag_sequence: The offending sequence of string tags, or ``None`` when it is not available.
    """

    def __init__(self, tag_sequence=None):
        super().__init__()
        self.tag_sequence = tag_sequence

    def __str__(self):
        """Return the tags joined by single spaces, or an empty string if no sequence was given."""
        # ``tag_sequence`` defaults to None; joining None would raise TypeError while the
        # exception is being printed and hide the original error.
        if self.tag_sequence is None:
            return ""
        return " ".join(self.tag_sequence)


T = str


def enumerate_spans(
        sentence: List[T],
        offset: int = 0,
        max_span_width: int = None,
        min_span_width: int = 1,
        filter_function: Callable[[List[T]], bool] = None,
) -> List[Tuple[int, int]]:
    """
    Enumerate the token spans of a sentence as `(start, end)` pairs with an `inclusive` end,
    ordered by start index and then by end index. Spans can be restricted by width and by an
    arbitrary predicate on the covered tokens.

    # Parameters

    sentence : `List[T]`, required.
        The sentence to generate spans for; any sliceable sequence with a length works.
    offset : `int`, optional (default = `0`)
        A number added to both indices of every returned span, e.g. the position of the
        sentence inside a larger document.
    max_span_width : `int`, optional (default = `None`)
        The largest number of tokens a span may cover. A falsy value means `len(sentence)`.
    min_span_width : `int`, optional (default = `1`)
        The smallest number of tokens a span may cover.
    filter_function : `Callable[[List[T]], bool]`, optional (default = `None`)
        Called with the tokens of each candidate span; the span is kept only when the result
        is truthy. By default every span is kept.

    # Returns

    `List[Tuple[int, int]]`
        The selected spans.
    """
    token_count = len(sentence)
    widest = max_span_width or token_count
    keep = filter_function or (lambda x: True)
    collected: List[Tuple[int, int]] = []

    for begin in range(token_count):
        stop = min(begin + widest, token_count)
        first_last = min(begin + min_span_width - 1, token_count)
        for last in range(first_last, stop):
            # `last` is inclusive, so the slice has to reach one position further.
            if keep(sentence[begin:last + 1]):
                collected.append((offset + begin, offset + last))
    return collected


def bio_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
    """
    Extract labelled spans from a sequence of BIO tags.
    Span indices are inclusive, so a one-token span has equal start and end.
    Ill-formed spans (those opened by an "I-LABEL" instead of a "B-LABEL") are extracted as
    well, so that a model cannot reach a perfect precision score while still predicting
    ill-formed spans. Unlabelled tags (plain "B", "I" and "O") are supported and yield an
    empty label.

    # Parameters

    tag_sequence : `List[str]`, required.
        The string tags of a sequence, e.g. ["B-PER", "I-PER", "O"].
    classes_to_ignore : `List[str]`, optional (default = `None`).
        Labels, given `without` their BIO prefix, whose tokens are treated like "O".

    # Returns

    spans : `List[TypedStringSpan]`
        The extracted spans in the format (label, (span_start, span_end)), in no particular
        order. The label `does not` contain any BIO tag prefix.

    # Raises

    `InvalidTagSequence`
        If a tag starts with a character other than "B", "I" or "O".
    """
    ignored = classes_to_ignore or []
    found: Set[Tuple[str, Tuple[int, int]]] = set()
    first = 0
    last = 0
    open_label = None  # label of the span currently being built, if any
    for position, tag in enumerate(tag_sequence):
        prefix = tag[0]
        if prefix not in ("B", "I", "O"):
            raise InvalidTagSequence(tag_sequence)
        label = tag[2:]
        if prefix == "O" or label in ignored:
            # Outside any span of interest: close whatever is open.
            if open_label is not None:
                found.add((open_label, (first, last)))
            open_label = None
        elif prefix == "I" and label == open_label:
            # The open span continues.
            last += 1
        else:
            # Either a "B" tag, or an "I" tag that does not continue the open span (no span
            # is open, or its label differs). Both close the open span and start a new one.
            if open_label is not None:
                found.add((open_label, (first, last)))
            open_label = label
            first = position
            last = position
    # The sequence may end while a span is still open.
    if open_label is not None:
        found.add((open_label, (first, last)))
    return list(found)


def iob1_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
    """
    Extract labelled spans from a sequence of IOB1 tags.
    Span indices are inclusive, so a one-token span has equal start and end.
    Ill-formed spans (a "B-LABEL" that is not preceded by "I-LABEL" or "B-LABEL") are
    extracted as well.

    # Parameters

    tag_sequence : `List[str]`, required.
        The string tags of a sequence, e.g. ["I-PER", "B-PER", "O"].
    classes_to_ignore : `List[str]`, optional (default = `None`).
        Labels, given `without` their prefix, whose tokens are treated like "O".

    # Returns

    spans : `List[TypedStringSpan]`
        The extracted spans in the format (label, (span_start, span_end)), in no particular
        order. The label `does not` contain any tag prefix.

    # Raises

    `InvalidTagSequence`
        If a tag starts with a character other than "B", "I" or "O".
    """
    ignored = classes_to_ignore or []
    found: Set[Tuple[str, Tuple[int, int]]] = set()
    first = 0
    last = 0
    open_label = None  # label of the span currently being built, if any
    previous = (None, None)  # (prefix, label) of the preceding tag
    for position, tag in enumerate(tag_sequence):
        prefix, label = tag[0], tag[2:]
        if prefix not in ("B", "I", "O"):
            raise InvalidTagSequence(tag_sequence)

        outside = prefix == "O" or label in ignored
        if outside or _iob1_start_of_chunk(previous[0], previous[1], prefix, label):
            # Either way the open span (if any) is finished here.
            if open_label is not None:
                found.add((open_label, (first, last)))
            if outside:
                open_label = None
            else:
                # A new span begins at this token.
                open_label = label
                first = position
                last = position
        else:
            # The open span continues.
            last += 1

        previous = (prefix, label)
    # The sequence may end while a span is still open.
    if open_label is not None:
        found.add((open_label, (first, last)))
    return list(found)


def _iob1_start_of_chunk(
        prev_bio_tag: Optional[str],
        prev_conll_tag: Optional[str],
        curr_bio_tag: str,
        curr_conll_tag: str,
) -> bool:
    if curr_bio_tag == "B":
        return True
    if curr_bio_tag == "I" and prev_bio_tag == "O":
        return True
    if curr_bio_tag != "O" and prev_conll_tag != curr_conll_tag:
        return True
    return False


def bioul_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
    """
    Extract labelled spans from a BIOUL-encoded tag sequence.

    Span boundaries are inclusive, so a single-token span has equal start and end.
    The sequence must be well formed: every "B" has to be followed by zero or more
    "I" tags and a closing "L"; anything else raises `InvalidTagSequence`.
    Unlabeled tags ("B", "I", "O", "U", "L" without a "-LABEL" suffix) are supported
    and yield spans whose label is the empty string.

    # Parameters

    tag_sequence : `List[str]`, required.
        The tag sequence encoded in BIOUL, e.g. ["B-PER", "L-PER", "O"].
    classes_to_ignore : `List[str]`, optional (default = `None`).
        Labels (without the BIOUL prefix) whose spans are dropped from the result.

    # Returns

    spans : `List[TypedStringSpan]`
        The extracted spans as (label, (span_start, span_end)) in order of appearance.
        A multi-token span takes its label from its closing "L" tag.
    """
    ignored = classes_to_ignore or []
    total = len(tag_sequence)
    extracted = []
    position = 0
    while position < total:
        tag = tag_sequence[position]
        prefix = tag[0]
        if prefix == "U":
            # Unit-length span.
            extracted.append((tag.partition("-")[2], (position, position)))
        elif prefix == "B":
            begin = position
            # Walk forward until the closing "L"; only "I" may appear in between.
            while tag[0] != "L":
                position += 1
                if position >= total:
                    raise InvalidTagSequence(tag_sequence)
                tag = tag_sequence[position]
                if tag[0] not in ("I", "L"):
                    raise InvalidTagSequence(tag_sequence)
            extracted.append((tag.partition("-")[2], (begin, position)))
        elif tag != "O":
            # A stray "I"/"L" or an unknown tag.
            raise InvalidTagSequence(tag_sequence)
        position += 1
    return [span for span in extracted if span[0] not in ignored]


def iobes_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
    """
    Extract labelled spans from an IOBES-encoded tag sequence.

    Span boundaries are inclusive, so a single-token span has equal start and end.
    The sequence must be well formed: every "B" has to be followed by zero or more
    "I" tags and a closing "E"; anything else raises `InvalidTagSequence`.
    Unlabeled tags ("B", "I", "O", "E", "S" without a "-LABEL" suffix) are supported
    and yield spans whose label is the empty string.

    # Parameters

    tag_sequence : `List[str]`, required.
        The tag sequence encoded in IOBES, e.g. ["B-PER", "E-PER", "O", "S-LOC"].
    classes_to_ignore : `List[str]`, optional (default = `None`).
        Labels (without the IOBES prefix) whose spans are dropped from the result.

    # Returns

    spans : `List[TypedStringSpan]`
        The extracted spans as (label, (span_start, span_end)) in order of appearance.
        A multi-token span takes its label from its closing "E" tag.
    """
    ignored = classes_to_ignore or []
    total = len(tag_sequence)
    extracted = []
    position = 0
    while position < total:
        tag = tag_sequence[position]
        prefix = tag[0]
        if prefix == "S":
            # Single-token span.
            extracted.append((tag.partition("-")[2], (position, position)))
        elif prefix == "B":
            begin = position
            # Walk forward until the closing "E"; only "I" may appear in between.
            while tag[0] != "E":
                position += 1
                if position >= total:
                    raise InvalidTagSequence(tag_sequence)
                tag = tag_sequence[position]
                if tag[0] not in ("I", "E"):
                    raise InvalidTagSequence(tag_sequence)
            extracted.append((tag.partition("-")[2], (begin, position)))
        elif tag != "O":
            # A stray "I"/"E" or an unknown tag.
            raise InvalidTagSequence(tag_sequence)
        position += 1
    return [span for span in extracted if span[0] not in ignored]


def iob1_to_bioul(tag_sequence: List[str]) -> List[str]:
    """
    Backward-compatible entry point superseded by `to_bioul`.

    Emits a `FutureWarning` pointing callers at `to_bioul`, then returns
    `to_bioul(tag_sequence)`, i.e. the conversion runs with `to_bioul`'s
    default `encoding`.
    """
    warnings.warn(
        "iob1_to_bioul has been replaced with 'to_bioul' to allow more encoding options.",
        FutureWarning,
    )
    return to_bioul(tag_sequence)


def to_bioul(tag_sequence: List[str], encoding: str = "IOB1") -> List[str]:
    """
    Recode a tag sequence from IOB1 or BIO into BIOUL.

    In the IOB1 scheme, I is a token inside a span, O is a token outside
    a span and B is the beginning of span immediately following another
    span of the same type.

    In the BIO scheme, I is a token inside a span, O is a token outside
    a span and B is the beginning of a span.

    In the BIOUL output a single-token span becomes "U", and a longer span becomes
    "B", zero or more "I", then "L".

    # Parameters

    tag_sequence : `List[str]`, required.
        The tag sequence encoded in IOB1 or BIO, e.g. ["I-PER", "I-PER", "O"].
    encoding : `str`, optional, (default = `"IOB1"`).
        The encoding type to convert from. Must be either "IOB1" or "BIO".

    # Returns

    bioul_sequence : `List[str]`
        The tag sequence encoded in BIOUL, e.g. ["B-PER", "L-PER", "O"].

    # Raises

    `ValueError` if `encoding` is not supported. `InvalidTagSequence` if a tag does
    not start with "B"/"I" and is not exactly "O", or, under "BIO", if an "I" tag
    does not continue a span of the same type.
    """
    if encoding not in {"IOB1", "BIO"}:
        raise ValueError(f"Invalid encoding {encoding} passed to 'to_bioul'.")

    strict_bio = encoding == "BIO"

    def recode(tag, prefix):
        # example: tag = 'I-PER', prefix = 'U', returns 'U-PER'
        _, dash, entity = tag.partition("-")
        return prefix + dash + entity

    def flush(span_tags, out):
        # Recode the tags of one finished span, append them to `out` and empty the buffer.
        if len(span_tags) == 1:
            out.append(recode(span_tags[0], "U"))
        else:
            out.append(recode(span_tags[0], "B"))
            out.extend(recode(inner, "I") for inner in span_tags[1:-1])
            out.append(recode(span_tags[-1], "L"))
        del span_tags[:]

    recoded: List[str] = []
    # Tags of the span currently being read.
    pending: List[str] = []

    for tag in tag_sequence:
        if tag == "O":
            # Leaving a span (if any); "O" is copied through unchanged.
            if pending:
                flush(pending, recoded)
            recoded.append(tag)
            continue

        prefix = tag[0]
        if prefix == "B":
            # An explicit begin always closes whatever span was open.
            if pending:
                flush(pending, recoded)
        elif prefix == "I":
            continues_span = bool(pending) and (
                tag.partition("-")[2] == pending[-1].partition("-")[2]
            )
            if not continues_span:
                # An "I" that starts a span is only legal in IOB1.
                if strict_bio:
                    raise InvalidTagSequence(tag_sequence)
                if pending:
                    flush(pending, recoded)
        else:
            raise InvalidTagSequence(tag_sequence)
        pending.append(tag)

    # The sequence may end inside a span.
    if pending:
        flush(pending, recoded)

    return recoded


def bmes_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
    """
    Extract labelled spans from a BMES-encoded tag sequence.

    Span boundaries are inclusive, so a single-token span has equal start and end.
    Ill-formed spans (e.g. an "M" or "E" that does not follow a matching "B"/"M")
    are not rejected: each such tag simply opens a span of its own. Reporting them
    prevents a perfect precision score for a model that predicts ill-formed spans
    next to the correct ones.
    Unlabeled tags ("B", "M", "E" and "S" without a "-LABEL" suffix) are supported.

    # Parameters

    tag_sequence : `List[str]`, required.
        The BMES tag strings of a sequence, e.g. ["B-PER", "E-PER", "S-LOC"].
    classes_to_ignore : `List[str]`, optional (default = `None`).
        Labels (without the BMES prefix) whose spans are dropped from the result.

    # Returns

    spans : `List[TypedStringSpan]`
        The extracted spans as (label, (span_start, span_end)) in order of appearance.
        Note that the label `does not` contain any BMES tag prefixes.
    """
    # Each entry is a mutable [label, start, end] triple.
    collected: List[list] = []
    previous_prefix: Optional[str] = None
    for position, tag in enumerate(tag_sequence):
        prefix, label = tag[0], tag[2:]
        # A span is only extended on a valid B/M -> M/E transition with a matching label.
        extends_last = (
            prefix in ("M", "E")
            and previous_prefix in ("B", "M")
            and collected[-1][0] == label
        )
        if extends_last:
            collected[-1][2] = position
        else:
            # "B"/"S" always start a span; invalid "M"/"E" are split off best-effort.
            collected.append([label, position, position])
        previous_prefix = prefix

    ignored = classes_to_ignore or []
    return [(label, (begin, end)) for label, begin, end in collected if label not in ignored]

================================================
FILE: hanlp/utils/string_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-25 00:19
import unicodedata
from typing import List, Dict, Tuple


def format_scores(results: Dict[str, float]) -> str:
    """Render a score dict as ``name: value`` cells joined by `` - ``.

    Args:
        results: Mapping from metric name to score.

    Returns:
        One line of text with every score formatted to four decimal places.
    """
    cells = []
    for name, score in results.items():
        cells.append(f'{name}: {score:.4f}')
    return ' - '.join(cells)


def ispunct(token):
    """Tell whether every character of ``token`` is in a Unicode punctuation category.

    Args:
        token: The string to inspect.

    Returns:
        ``True`` if all characters have a category starting with ``P``; an empty token yields ``True``.
    """
    for char in token:
        if not unicodedata.category(char).startswith('P'):
            return False
    return True


def split_long_sentence_into(tokens: List[str], max_seq_length, sent_delimiter=None, char_level=False,
                             hard_constraint=False):
    """Lazily split ``tokens`` into shorter chunks, cutting right after delimiter tokens.

    Args:
        tokens: The tokens of one (long) sentence.
        max_seq_length: Length threshold that triggers a cut. Lengths are counted in tokens, or in
            characters (sum of ``len(token)``) when ``char_level`` is true.
        sent_delimiter: Container of tokens that may end a chunk. When it is falsy, every token for
            which ``ispunct`` returns true is a candidate cut point instead.
        char_level: Measure lengths in characters rather than in tokens.
        hard_constraint: When true, a chunk selected by the threshold check is re-cut by
            ``_gen_short_sent``; when false it is yielded as is and may be longer than ``max_seq_length``.

    Yields:
        Consecutive slices of ``tokens``.
    """
    # Indices of the tokens after which a chunk is allowed to end.
    punct_offset = [i for i, x in enumerate(tokens) if
                    ((sent_delimiter and x in sent_delimiter) or (not sent_delimiter and ispunct(x)))]
    if not punct_offset:
        # treat every token as punct
        punct_offset = [i for i in range(len(tokens))]
    # Sentinel one past the last token, so every cut point has a "next cut point" to look ahead to.
    punct_offset += [len(tokens)]
    # Prefix sums of token lengths: entry i is the number of characters before token i,
    # the final entry is the total. Only filled (and only read by _len) when char_level is true.
    token_to_char_offset = []
    if char_level:
        offset = 0
        for token in tokens:
            token_to_char_offset.append(offset)
            offset += len(token)
        token_to_char_offset.append(offset)

    start = 0
    for i, offset in enumerate(punct_offset[:-1]):
        end = punct_offset[i + 1]
        # Length of tokens[start:end], i.e. what the chunk would measure if it ran up to
        # (but excluded) the next cut point.
        length_at_next_punct = _len(start, end, token_to_char_offset, char_level)
        if length_at_next_punct >= max_seq_length:
            # Going on to the next cut point would reach the threshold: end the chunk at this one.
            if hard_constraint:
                yield from _gen_short_sent(tokens, start, offset, max_seq_length, token_to_char_offset, char_level)
            else:
                yield tokens[start: offset + 1]
            start = offset + 1
    # Whatever is left after the last emitted chunk.
    offset = punct_offset[-1]
    if start < offset:
        # From here on `offset` is the index of the last token (inclusive end for _gen_short_sent).
        offset -= 1
        # Measures tokens[start:offset], i.e. the remainder without its final token.
        length_at_next_punct = _len(start, offset, token_to_char_offset, char_level)
        if length_at_next_punct >= max_seq_length and hard_constraint:
            yield from _gen_short_sent(tokens, start, offset, max_seq_length, token_to_char_offset, char_level)
        else:
            yield tokens[start:]


def _gen_short_sent(tokens, start, offset, max_seq_length, token_to_char_offset, char_level):
    """Greedily cut ``tokens[start:offset + 1]`` into pieces of at most ``max_seq_length``.

    Each piece is the longest prefix of the remaining tokens whose length (as measured by ``_len``)
    does not exceed ``max_seq_length``. A piece always holds at least one token, so a single token
    that is too long on its own is still yielded.

    Args:
        tokens: The full token list.
        start: Index of the first token to emit.
        offset: Index of the last token to emit (inclusive).
        max_seq_length: Maximum length of a piece.
        token_to_char_offset: Character prefix sums, forwarded to ``_len``.
        char_level: Whether lengths are measured in characters, forwarded to ``_len``.

    Yields:
        Consecutive slices of ``tokens``.
    """
    limit = offset + 1
    while start <= offset:
        end = limit
        # Shrink the candidate piece from the right until it fits or only one token is left.
        while end > start + 1 and _len(start, end, token_to_char_offset, char_level) > max_seq_length:
            end -= 1
        yield tokens[start: end]
        start = end


def _len(start, end, token_to_char_offset, char_level):
    if char_level:
        length_at_next_punct = token_to_char_offset[end] - token_to_char_offset[start]
    else:
        length_at_next_punct = end - start
    return length_at_next_punct


def guess_delimiter(tokens):
    """Pick the string used to join tokens back together.

    Args:
        tokens: An iterable of strings.

    Returns:
        A single space when every character of every token is ASCII (code point below 128),
        otherwise the empty string.
    """
    text = ''.join(tokens)
    has_non_ascii = any(ord(char) >= 128 for char in text)
    return '' if has_non_ascii else ' '


def split_long_sent(sent, delimiters, max_seq_length):
    """Split ``sent`` after delimiters and regroup the pieces into chunks of bounded length.

    ``sent`` is first cut after every element found in ``delimiters``. Neighbouring pieces are then
    merged greedily: a chunk is closed as soon as adding the next piece would make it longer than
    ``max_seq_length``. A single piece longer than ``max_seq_length`` is not cut any further.

    Args:
        sent: A string or a list of tokens.
        delimiters: Container of elements that may end a piece.
        max_seq_length: Maximum length of a merged chunk.

    Yields:
        ``sent`` itself when it contains no delimiter; otherwise lists holding the elements of
        each chunk.
    """
    parts = []
    offset = 0
    for idx, char in enumerate(sent):
        if char in delimiters:
            parts.append(sent[offset:idx + 1])
            offset = idx + 1
    if not parts:
        yield sent
        return
    if offset < len(sent):
        # Keep the text after the last delimiter; dropping it would silently lose input.
        parts.append(sent[offset:])
    short = []
    last = len(parts) - 1
    for idx, part in enumerate(parts):
        short += part
        if idx == last:
            yield short
        elif len(short) + len(parts[idx + 1]) > max_seq_length:
            yield short
            short = []


def possible_tokenization(text: str) -> List[Tuple[str]]:
    """Enumerate all possible tokenizations of a text.

    After every character there is a choice between closing the current token and
    keeping it open, so the number of states doubles with each character.

    Args:
        text: A text.

    Returns: All possible tokenizations, each one a tuple of tokens.

    """
    # Every state pairs the tokens finished so far with the characters of the open token.
    frontier = [((), ())]
    for char in text:
        frontier = [
            successor
            for finished, buffer in frontier
            for successor in (
                # close the token right after this character
                (finished + (''.join(buffer + (char,)),), ()),
                # keep the token open
                (finished, buffer + (char,)),
            )
        ]
    # Only states without a dangling open token are complete tokenizations.
    return [finished for finished, buffer in frontier if not buffer]


================================================
FILE: hanlp/utils/tf_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-27 01:27
import json
import logging
import os
import random
from typing import List

import numpy as np

from hanlp_common.constant import PAD


def set_gpu(idx=0):
    """Restrict TensorFlow to only use the GPU of idx

    Does nothing when ``get_visible_gpus()`` reports no GPU.

    Args:
      idx: Position of the GPU to keep within the list of visible GPUs. (Default value = 0)

    Raises:
      RuntimeError: Re-raised from TensorFlow (virtual devices must be set before GPUs
        have been initialized).
    """
    physical_gpus = get_visible_gpus()
    if not physical_gpus:
        return
    try:
        tf.config.experimental.set_visible_devices(physical_gpus[idx], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        assert len(logical_gpus) == 1
    except RuntimeError as e:
        # Virtual devices must be set before GPUs have been initialized
        raise e


def get_visible_gpus():
    """Return whatever ``tf.config.experimental.list_physical_devices('GPU')`` reports."""
    gpus = tf.config.experimental.list_physical_devices('GPU')
    return gpus


def set_gpu_memory_growth(growth=True):
    """Apply the same memory-growth setting to every GPU returned by ``get_visible_gpus()``.

    Does nothing when no GPU is reported.

    Args:
      growth: Value passed to ``tf.config.experimental.set_memory_growth``. (Default value = True)

    Raises:
      RuntimeError: Re-raised from TensorFlow (memory growth must be set before GPUs
        have been initialized).
    """
    physical_gpus = get_visible_gpus()
    if not physical_gpus:
        return
    try:
        # Currently, memory growth needs to be the same across GPUs
        for device in physical_gpus:
            tf.config.experimental.set_memory_growth(device, growth)
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        raise e


def nice_gpu():
    """Use GPU nicely.

    Turns on memory growth for the visible GPUs, then restricts TensorFlow to one GPU
    (``set_gpu`` with its default index).
    """
    set_gpu_memory_growth()
    set_gpu()


def shut_up_python_logging():
    """Quiet the Python-side logging of TensorFlow and absl.

    Sets the ``tensorflow`` logger to ERROR, detaches absl's handler from the root logger and
    switches off absl's ``_warn_preinit_stderr`` flag.
    """
    logging.getLogger('tensorflow').setLevel(logging.ERROR)
    # Imported lazily inside the function.
    import absl.logging
    # NOTE(review): relies on private absl attributes; these may change between absl releases.
    logging.root.removeHandler(absl.logging._absl_handler)
    absl.logging._warn_preinit_stderr = False


def set_tf_loglevel(level=logging.ERROR):
    """Set the verbosity of TensorFlow's native and Python logging.

    The Python ``logging`` level is mapped onto the ``TF_CPP_MIN_LOG_LEVEL`` and
    ``TF_CPP_MIN_VLOG_LEVEL`` environment variables: FATAL and above -> '3',
    ERROR -> '2', WARNING -> '1', anything lower -> '0'.

    Args:
      level: A ``logging`` level. (Default value = logging.ERROR)
    """
    # The branches must be exclusive: with independent ``if`` statements the WARNING branch
    # overwrote the value chosen for ERROR and FATAL, leaving both at '1'.
    if level >= logging.FATAL:
        cpp_level = '3'
    elif level >= logging.ERROR:
        cpp_level = '2'
    elif level >= logging.WARNING:
        cpp_level = '1'
    else:
        cpp_level = '0'
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = cpp_level
    os.environ['TF_CPP_MIN_VLOG_LEVEL'] = cpp_level
    shut_up_python_logging()
    # shut_up_python_logging() pins the logger to ERROR, so apply the requested level afterwards.
    logging.getLogger('tensorflow').setLevel(level)


# Import-time setup. set_tf_loglevel() writes the TF_CPP_MIN_* environment variables and is
# deliberately called ahead of the TensorFlow import below -- presumably so the native runtime
# reads them while loading (TODO confirm).
set_tf_loglevel()

shut_up_python_logging()
# Late import: the functions defined above only touch `tf` when called, which happens after this line.
import tensorflow as tf

# Enable GPU memory growth and restrict TensorFlow to a single GPU as soon as the module is imported.
nice_gpu()


def size_of_dataset(dataset: tf.data.Dataset) -> int:
    """Count the elements of a dataset by iterating over it once.

    Args:
      dataset: The dataset; it is unbatched and re-batched with batch size 1 before counting.

    Returns:
      The number of batches of size 1 that were produced.
    """
    return sum(1 for _ in dataset.unbatch().batch(1))


def summary_of_model(model: tf.keras.Model):
    """Capture the text of ``model.summary()`` as a string.

    See https://stackoverflow.com/a/53668338/3730690

    Args:
      model: tf.keras.Model: The model to summarize.

    Returns:
      The summary lines joined by newlines, or a placeholder message if the model is not built yet.
    """
    if not model.built:
        return 'model structure unknown until calling fit() with some data'
    lines = []
    # Collect the lines instead of letting Keras print them.
    model.summary(print_fn=lines.append)
    return "\n".join(lines)


def register_custom_cls(custom_cls, name=None):
    """Store ``custom_cls`` in the Keras custom-object registry.

    Args:
      custom_cls: The class to register.
      name: Registry key; falls back to ``custom_cls.__name__`` when falsy. (Default value = None)
    """
    key = name or custom_cls.__name__
    tf.keras.utils.get_custom_objects()[key] = custom_cls


def set_seed_tf(seed=233):
    """Seed the TensorFlow, NumPy and Python random number generators with the same value.

    Args:
      seed: The seed. (Default value = 233)
    """
    for seeder in (tf.random.set_seed, np.random.seed, random.seed):
        seeder(seed)


def nice():
    """Apply the GPU settings of ``nice_gpu()`` and then seed the RNGs via ``set_seed_tf()``."""
    nice_gpu()
    set_seed_tf()


def hanlp_register(arg):
    """Registers a class with the Keras serialization framework.

    The class is stored in the Keras custom-object registry under ``HanLP>`` followed by its
    ``__name__``. No check is made that the class defines ``get_config()``.

    Args:
      arg: The class to register.

    Returns:
      ``arg`` itself, so the function can be used as a class decorator.

    """
    registered_name = f'HanLP>{arg.__name__}'
    tf.keras.utils.get_custom_objects()[registered_name] = arg
    return arg


def tensor_is_eager(tensor: tf.Tensor):
    """Report whether ``tensor`` exposes a ``numpy`` attribute, used here as the mark of an eager tensor."""
    return hasattr(tensor, 'numpy')


def copy_mask(src: tf.Tensor, dst: tf.Tensor):
    """Copy the ``_keras_mask`` attribute from ``src`` onto ``dst`` when ``src`` carries one.

    Args:
      src: Tensor that may have a ``_keras_mask`` attribute.
      dst: Tensor that receives the mask.

    Returns:
      The mask that was copied, or ``None`` if ``src`` has no mask (``dst`` is then left untouched).
    """
    mask = getattr(src, '_keras_mask', None)
    if mask is None:
        return None
    dst._keras_mask = mask
    return mask


def get_callback_by_class(callbacks: List[tf.keras.callbacks.Callback], cls) -> tf.keras.callbacks.Callback:
    """Find the first callback that is an instance of ``cls``.

    Args:
      callbacks: The callbacks to search, in order.
      cls: The class (or tuple of classes) to look for.

    Returns:
      The first matching callback, or ``None`` when there is none.
    """
    return next((candidate for candidate in callbacks if isinstance(candidate, cls)), None)


def tf_bernoulli(shape, p, dtype=None):
    """Forward ``shape``, ``p`` and ``dtype`` positionally to ``tf.keras.backend.random_binomial``."""
    # NOTE(review): presumably draws 0/1 samples with success probability p -- confirm against the
    # installed TensorFlow version's Keras backend.
    return tf.keras.backend.random_binomial(shape, p, dtype)


def str_tensor_to_str(str_tensor: tf.Tensor) -> str:
    """Decode the value returned by ``str_tensor.numpy()`` as UTF-8."""
    # assumes str_tensor is a scalar string tensor whose numpy() value is a bytes object -- TODO confirm
    return str_tensor.numpy().decode('utf-8')


def str_tensor_2d_to_list(str_tensor: tf.Tensor, pad=PAD) -> List[List[str]]:
    """Convert a padded 2-D string tensor into a list of string lists.

    Args:
      str_tensor: Tensor iterated row by row and cell by cell.
      pad: Padding value; a row is cut at the first cell that decodes to it. (Default value = PAD)

    Returns:
      One list of decoded strings per row, without the padding.
    """
    sentences = []
    for row in str_tensor:
        words = []
        for cell in row:
            word = str_tensor_to_str(cell)
            if word == pad:
                # Everything from the first pad on is dropped.
                break
            words.append(word)
        sentences.append(words)
    return sentences


def str_tensor_to_list(pred):
    """Decode every byte string in ``pred`` as UTF-8.

    Args:
      pred: An iterable of ``bytes`` objects.

    Returns:
      A list with the decoded strings.
    """
    # bytes objects have no ``predict`` method; ``decode`` is the call that turns them into str.
    return [tag.decode('utf-8') for tag in pred]


def format_metrics(metrics: List[tf.keras.metrics.Metric]):
    """Render metrics as ``name: value`` cells joined by `` - ``.

    Args:
      metrics: Objects exposing a ``name`` attribute and a ``result()`` method.

    Returns:
      One line of text with every result formatted to four decimal places.
    """
    cells = []
    for metric in metrics:
        cells.append(f'{metric.name}: {metric.result():.4f}')
    return ' - '.join(cells)


class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy integers, floats and arrays."""

    def default(self, obj):
        """Special json encoder for numpy types
        See https://interviewbubble.com/typeerror-object-of-type-float32-is-not-json-serializable/

        Args:
            obj: Object the standard encoder could not serialize.

        Returns:
            An ``int``, ``float`` or (nested) ``list`` standing in for ``obj``.

        Raises:
            TypeError: Raised by the base class when ``obj`` is not one of the NumPy types handled here.
        """
        if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
                            np.int16, np.int32, np.int64, np.uint8,
                            np.uint16, np.uint32, np.uint64)):
            return int(obj)
        # ``np.float_`` was removed in NumPy 2.0; it was only an alias of ``np.float64``, which is
        # listed here, so leaving it out keeps the behaviour on older NumPy versions.
        if isinstance(obj, (np.float16, np.float32, np.float64)):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

================================================
FILE: hanlp/utils/time_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-27 00:01
import datetime
import logging
import sys
import time
from typing import Union

from hanlp.utils.log_util import ErasablePrinter, color_format, color_format_len


def human_time_delta(days, hours, minutes, seconds, delimiter=' ') -> str:
    """Format a duration such as ``1 h 0 m 5 s``, leaving out leading units that are zero.

    Args:
        days: Number of days.
        hours: Number of hours.
        minutes: Number of minutes.
        seconds: Number of seconds.
        delimiter: String placed between the units. (Default value = ' ')

    Returns:
        The units from the first non-zero one down to seconds, or ``'0 s'`` if all of them are zero.
    """
    cells = []
    started = False
    for amount, unit in ((days, 'd'), (hours, 'h'), (minutes, 'm'), (seconds, 's')):
        # Once a non-zero unit has been seen, all smaller units are printed, zero or not.
        if started or amount:
            started = True
            cells.append('{} {}'.format(amount, unit))
    if not started:
        return '0 s'
    return delimiter.join(cells)


def seconds_to_time_delta(seconds):
    """Break a number of seconds down into days, hours, minutes and seconds.

    Args:
        seconds: The duration; it is rounded to whole seconds first.

    Returns:
        A ``(days, hours, minutes, seconds)`` tuple.
    """
    total_minutes, secs = divmod(round(seconds), 60)
    total_hours, minutes = divmod(total_minutes, 60)
    days, hours = divmod(total_hours, 24)
    return days, hours, minutes, secs


def report_time_delta(seconds, human=True):
    """Report a duration either as text or as its components.

    Args:
        seconds: The duration in seconds.
        human: Return the text produced by ``human_time_delta`` instead of the raw tuple.
            (Default value = True)

    Returns:
        A string when ``human`` is true, otherwise a ``(days, hours, minutes, seconds)`` tuple.
    """
    components = seconds_to_time_delta(seconds)
    if not human:
        return components
    return human_time_delta(*components)


class HumanTimeDelta(object):
    """A duration in seconds that prints itself in human-readable form."""

    def __init__(self, delta_seconds) -> None:
        """
        Args:
            delta_seconds: The duration in seconds.
        """
        super().__init__()
        self.delta_seconds = delta_seconds

    def report(self, human=True):
        """Return ``report_time_delta(self.delta_seconds, human)``: text when ``human`` is true,
        otherwise the component tuple."""
        return report_time_delta(self.delta_seconds, human)

    def __str__(self) -> str:
        return self.report(human=True)

    def __truediv__(self, scalar):
        """Divide the duration by ``scalar`` and wrap the result in a new ``HumanTimeDelta``."""
        return HumanTimeDelta(self.delta_seconds / scalar)


class CountdownTimer(ErasablePrinter):
    """Progress tracker that counts ``current`` up to ``total`` and reports elapsed and remaining time."""

    def __init__(self, total: int, out=sys.stdout) -> None:
        """
        Args:
            total: Number of steps expected.
            out: Stream forwarded to ``ErasablePrinter.__init__``.
        """
        super().__init__(out=out)
        self.total = total
        self.current = 0
        self.start = time.time()
        # Seconds the whole run took; stays None until the timer finishes or is stopped.
        self.finished_in = None
        self.last_log_time = 0

    def update(self, n=1):
        """Advance the counter by ``n`` steps, never beyond ``total``; record the run time on completion."""
        self.current += n
        self.current = min(self.total, self.current)
        if self.current == self.total:
            self.finished_in = time.time() - self.start

    @property
    def ratio(self) -> str:
        """Progress as ``current/total``."""
        return f'{self.current}/{self.total}'

    @property
    def ratio_percentage(self) -> str:
        """Progress as a percentage with two decimals."""
        # NOTE(review): raises ZeroDivisionError when total is 0 -- confirm whether callers can
        # create a timer for an empty workload.
        return f'{self.current / self.total:.2%}'

    @property
    def eta(self) -> float:
        """Estimated seconds remaining, extrapolated from the average time per step so far."""
        elapsed = self.elapsed
        if self.finished_in:
            eta = 0
        else:
            # max(..., 0.1) avoids a division by zero before the first step.
            eta = elapsed / max(self.current, 0.1) * (self.total - self.current)

        return eta

    @property
    def elapsed(self) -> float:
        """Seconds since construction, frozen at ``finished_in`` once the timer has finished."""
        if self.finished_in:
            elapsed = self.finished_in
        else:
            elapsed = time.time() - self.start
        return elapsed

    @property
    def elapsed_human(self) -> str:
        """``elapsed`` formatted by ``human_time_delta``."""
        return human_time_delta(*seconds_to_time_delta(self.elapsed))

    @property
    def elapsed_average(self) -> float:
        """Average seconds per completed step."""
        # Guarded like ``total_time``: before the first step ``current`` is 0 and a plain division
        # would raise ZeroDivisionError.
        return self.elapsed / max(self.current, 1)

    @property
    def elapsed_average_human(self) -> str:
        """``elapsed_average`` formatted by ``human_time_delta``."""
        return human_time_delta(*seconds_to_time_delta(self.elapsed_average))

    @property
    def eta_human(self) -> str:
        """``eta`` formatted by ``human_time_delta``."""
        return human_time_delta(*seconds_to_time_delta(self.eta))

    @property
    def total_time(self) -> float:
        """Seconds the whole run took if finished, otherwise the projected total run time."""
        elapsed = self.elapsed
        if self.finished_in:
            t = self.finished_in
        else:
            t = elapsed / max(self.current, 1) * self.total

        return t

    @property
    def total_time_human(self) -> str:
        """``total_time`` formatted by ``human_time_delta``."""
        return human_time_delta(*seconds_to_time_delta(self.total_time))

    def stop(self, total=None):
        """Mark the timer as finished.

        Args:
            total: If given (truthy), both ``current`` and ``total`` are set to it and the finish time
                is re-taken even if the timer had already finished. Otherwise ``total`` is shrunk to
                the steps completed so far.
        """
        if not self.finished_in or total:
            self.finished_in = time.time() - self.start
            if not total:
                self.total = self.current
            else:
                self.current = total
                self.total = total

    @property
    def et_eta(self):
        """Elapsed seconds once finished, otherwise the estimated seconds remaining."""
        if self.finished_in:
            return self.elapsed
        else:
            return self.eta

    @property
    def et_eta_human(self):
        """``et_eta`` as text, prefixed with ``ET:`` once finished and ``ETA:`` before that."""
        text = human_time_delta(*seconds_to_time_delta(self.et_eta))
        if self.finished_in:
            return f'ET: {text}'
        else:
            return f'ETA: {text}'

    @property
    def finished(self):
        """Whether ``current`` has reached ``total``."""
        return self.total == self.current

    def log(self, info=None, ratio_percentage=True, ratio=True, step=1, interval=0.5, erase=True,
            logger: Union[logging.Logger, bool] = None, newline=False, ratio_width=None):
        """Advance the timer and print a progress line if enough time has passed.

        Args:
            info: Extra text placed before the time estimate; left out when falsy.
            ratio_percentage: Include the percentage cell.
            ratio: Include the ``current/total`` cell.
            step: Number of steps to advance, passed to ``update``.
            interval: Minimum seconds between two printed lines; the final line is always printed.
            erase: Forwarded to ``print``.
            logger: Forwarded to ``print``.
            newline: Forwarded to ``print``.
            ratio_width: Width the ratio cell is right-justified to; defaults to ``self.ratio_width``.
        """
        self.update(step)
        now = time.time()
        if now - self.last_log_time > interval or self.finished:
            cells = []
            if ratio_percentage:
                cells.append(self.ratio_percentage)
            if ratio:
                # From here on ``ratio`` holds the rendered text rather than the flag.
                ratio = self.ratio
                if not ratio_width:
                    ratio_width = self.ratio_width
                ratio = ratio.rjust(ratio_width)
                cells.append(ratio)
            cells += [info, self.et_eta_human]
            # Drop empty cells such as a missing ``info``.
            cells = [x for x in cells if x]
            msg = f'{" ".join(cells)}'
            self.last_log_time = now
            self.print(msg, newline, erase, logger)

    @property
    def ratio_width(self) -> int:
        """Width of the widest possible ``current/total`` text: twice the digits of ``total`` plus the slash."""
        return len(f'{self.total}') * 2 + 1

    def print(self, msg, newline=False, erase=True, logger=None):
        """Replace the previously printed progress line with ``msg``.

        Args:
            msg: The line to show.
            newline: Terminate the line so that it is not erased by the next call.
            erase: When the timer has finished and no logger is used, erase the final line instead of
                ending it with a newline.
            logger: If truthy, the final message is not written to the stream; when it is a
                ``logging.Logger`` the message goes to ``logger.info`` instead.
        """
        # NOTE(review): output is written to sys.stdout directly, while the constructor hands ``out``
        # to the base class -- confirm whether a custom ``out`` is meant to be honoured here.
        self.erase()
        msg_len = 0 if newline else len(msg)
        if self.finished and logger:
            sys.stdout.flush()
            if isinstance(logger, logging.Logger):
                logger.info(msg)
        else:
            # color_format_len returns the message to write together with the width to erase later.
            msg, msg_len = color_format_len(msg)
            sys.stdout.write(msg)
            if newline:
                sys.stdout.write('\n')
                msg_len = 0
        self._last_print_width = msg_len
        if self.finished and not logger:
            if erase:
                self.erase()
            else:
                sys.stdout.write("\n")
                self._last_print_width = 0
        sys.stdout.flush()


class Timer(object):
    """Stopwatch that measures the time between consecutive ``start``/``stop`` calls."""

    def __init__(self) -> None:
        # Timestamp of the most recent construction, start() or stop().
        self.last = time.time()

    def start(self):
        """Restart the measurement from now."""
        self.last = time.time()

    def stop(self) -> HumanTimeDelta:
        """Return the time since the last reference point and make now the new reference point."""
        previous, self.last = self.last, time.time()
        return HumanTimeDelta(self.last - previous)


def now_human(year='y'):
    """Format the current local time as ``<year>-mm-dd HH:MM:SS``.

    Args:
      year: The ``strftime`` directive letter used for the year: ``'y'`` for two digits,
        ``'Y'`` for four. (Default value = 'y')

    Returns:
      The formatted timestamp.
    """
    pattern = f"%{year}-%m-%d %H:%M:%S"
    return datetime.datetime.now().strftime(pattern)


def now_datetime():
    """Return ``now_human('Y')``, i.e. the current time with a four-digit year."""
    return now_human('Y')


def now_filename(fmt="%y%m%d_%H%M%S"):
    """Generate a filename from the current local time, e.g. 180102_030405 with the default format.

    Args:
      fmt: ``strftime`` format string. (Default value = "%y%m%d_%H%M%S")

    Returns:
      The formatted timestamp.
    """
    return datetime.datetime.now().strftime(fmt)


================================================
FILE: hanlp/utils/torch_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-09 15:52
import os
import random
import time
from typing import List, Union, Dict, Tuple

import numpy as np
import torch
from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit, nvmlShutdown, nvmlDeviceGetCount
from torch import nn
from torch.nn.utils.rnn import pad_sequence

from hanlp.utils.io_util import get_resource, replace_ext, TimingFileIterator
from hanlp.utils.log_util import logger, flash
from hanlp_common.constant import HANLP_VERBOSE
from hanlp_common.io import load_pickle, save_pickle


def gpus_available() -> Dict[int, float]:
    """Report the fraction of free memory of every GPU visible to this process.

    The devices are taken from ``CUDA_VISIBLE_DEVICES`` when it is set, otherwise all devices known
    to NVML are used. Keys are positions within that list, values are ``free / total`` memory.

    Returns:
        A dict ordered from the most to the least free GPU. It is empty when CUDA is unavailable.
        If querying NVML fails for any reason, every device counted by ``torch.cuda.device_count()``
        is reported with a ratio of 1.0.
    """
    if not torch.cuda.is_available():
        return dict()
    try:
        nvmlInit()
        try:
            gpus = {}
            visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', None)
            if visible_devices is None:
                visible_devices = list(range(nvmlDeviceGetCount()))
            else:
                # Keep the listed order: position i in CUDA_VISIBLE_DEVICES is logical device i,
                # and a set would iterate in an unrelated order.
                visible_devices = [int(x.strip()) for x in visible_devices.split(',')]
            for i, real_id in enumerate(visible_devices):
                info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(real_id))
                gpus[i] = info.free / info.total
        finally:
            # Release NVML even if a query above failed.
            nvmlShutdown()
        return dict(sorted(gpus.items(), key=lambda x: x[1], reverse=True))
    except Exception as e:
        # Best effort: without NVML information all devices are treated as completely free.
        logger.debug(f'Failed to get gpu info due to {e}')
        return dict((i, 1.0) for i in range(torch.cuda.device_count()))


def cuda_devices(query=None) -> List[int]:
    """Decide which GPUs to use

    Args:
      query: What to select (Default value = None)

        - a list is returned unchanged, except that an empty list becomes ``[-1]``;
        - ``None`` picks one GPU among those with the largest free-memory ratio;
        - a float selects every GPU whose free-memory ratio is greater than it;
        - an int selects exactly that GPU.

    Returns:
      A list of device ids; empty when ``None`` is given and no GPU is available, or when
      the float query is zero. Any other kind of query is returned as it is.
    """
    if isinstance(query, list):
        return [-1] if len(query) == 0 else query
    if query is None:
        free_ratios = gpus_available()
        if not free_ratios:
            return []
        best_ratio, _ = max((ratio, gpu) for gpu, ratio in free_ratios.items())
        # When multiple GPUs have the same size, randomly pick one to avoid conflicting
        equally_free = [gpu for gpu, ratio in free_ratios.items() if ratio == best_ratio]
        query = random.choice(equally_free)
    if isinstance(query, float):
        free_ratios = gpus_available()
        if not query:
            return []
        return [gpu for gpu, ratio in free_ratios.items() if ratio > query]
    if isinstance(query, int):
        return [query]
    return query


def pad_lists(sequences: List[List], dtype=torch.long, padding_value=0):
    """Turn lists of different lengths into one padded, batch-first tensor.

    Args:
      sequences: The lists to stack; each becomes one row.
      dtype: Data type of the tensors. (Default value = torch.long)
      padding_value: Value used to fill the shorter rows. (Default value = 0)

    Returns:
      A tensor with one row per sequence, as wide as the longest sequence.
    """
    rows = [torch.tensor(sequence, dtype=dtype) for sequence in sequences]
    return pad_sequence(rows, batch_first=True, padding_value=padding_value)


def set_seed(seed=233, dont_care_speed=False):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random number generators.

    Copied from https://github.com/huggingface/transformers/blob/7b75aa9fa55bee577e2c7403301ed31103125a35/src/transformers/trainer.py#L76

    Args:
      seed: The seed. (Default value = 233)
      dont_care_speed: True may have a negative single-run performance impact, but ensures deterministic
        cuDNN behaviour by turning ``deterministic`` on and ``benchmark`` off.
    """
    # torch.cuda.manual_seed_all is safe to call even if cuda is not available
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    if not dont_care_speed:
        return
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def batched_index_select(input, index, dim=1):
    """Select, separately for every batch item, the entries of ``input`` along ``dim`` given by ``index``.

    Args:
      input: B x * x ... x *
      index: B x M
      dim: The dimension to select along. (Default value = 1)

    Returns:
      A tensor shaped like ``input`` except that dimension ``dim`` has size M.
    """
    shape = list(input.shape)
    # Put the M indices on `dim` and singleton axes everywhere else behind the batch axis ...
    view_shape = [shape[0]] + [-1 if axis == dim else 1 for axis in range(1, len(shape))]
    # ... then broadcast them over the remaining axes (-1 keeps the batch and index sizes).
    shape[0] = shape[dim] = -1
    expanded_index = index.view(view_shape).expand(shape)
    return torch.gather(input, dim, expanded_index)


def truncated_normal_(tensor, mean=0, std=1):
    """Fill ``tensor`` in place with normal samples that are (almost always) within two standard deviations.

    Four standard-normal candidates are drawn per element and one that lies strictly between -2 and 2
    is picked; the result is then scaled by ``std`` and shifted by ``mean``.

    Args:
      tensor: The tensor to overwrite.
      mean: Mean of the distribution. (Default value = 0)
      std: Standard deviation of the distribution. (Default value = 1)

    Returns:
      ``tensor`` itself.
    """
    candidates = tensor.new_empty(tensor.shape + (4,)).normal_()
    in_range = (candidates < 2) & (candidates > -2)
    # Index of a maximal (i.e. True, if there is one) entry among the four candidates.
    picked = in_range.max(-1, keepdim=True)[1]
    samples = candidates.gather(-1, picked).squeeze(-1)
    tensor.data.copy_(samples)
    tensor.data.mul_(std).add_(mean)
    return tensor


def dtype_of(e: Union[int, bool, float]):
    """Map a Python scalar to the torch dtype used to store it.

    Args:
      e: A ``bool``, ``int`` or ``float``.

    Returns:
      ``torch.bool``, ``torch.long`` or ``torch.float`` respectively.

    Raises:
      ValueError: If ``e`` is of any other type.
    """
    # bool must be tested before int because bool is a subclass of int.
    for python_type, torch_type in ((bool, torch.bool), (int, torch.long), (float, torch.float)):
        if isinstance(e, python_type):
            return torch_type
    raise ValueError(f'Unsupported type of {repr(e)}')


def mean_model(model: torch.nn.Module):
    """Compute a one-number fingerprint of a model's trainable weights.

    Args:
      model: The model.

    Returns:
      The mean, as a Python float, of the per-parameter sums over all parameters that require gradients.
    """
    sums = [parameter.sum() for parameter in model.parameters() if parameter.requires_grad]
    return float(torch.stack(sums).mean())


def main():
    """Ad-hoc check: print the result of ``gpus_available()`` and the seconds the call took."""
    start = time.time()
    print(gpus_available())
    print(time.time() - start)
    # Further manual checks, kept for reference:
    # print(gpus_available())
    # print(cuda_devices())
    # print(cuda_devices(0.1))


# NOTE: this guard sits in the middle of the module, so when the file is run as a script the
# definitions below it are not bound yet while main() executes. main() only calls gpus_available(),
# which is defined above.
if __name__ == '__main__':
    main()


def clip_grad_norm(model: nn.Module, grad_norm, transformer: nn.Module = None, transformer_grad_norm=None):
    """Clip the gradient norm of a model's trainable parameters, optionally with a separate limit
    for the parameters that belong to a transformer sub-module.

    Args:
      model: Module whose parameters with ``requires_grad`` set are clipped.
      grad_norm: Maximum norm for the non-transformer parameters (or for all parameters when
        ``transformer_grad_norm`` is ``None``). ``None`` leaves that group unclipped.
      transformer: Sub-module whose parameters form the transformer group. Must be given when
        ``transformer_grad_norm`` is not ``None``. (Default value = None)
      transformer_grad_norm: Maximum norm for the transformer group. ``None`` means all trainable
        parameters are treated as one group. (Default value = None)

    Returns:
      None. Gradients are modified in place.
    """

    def _clip(params, max_norm):
        # An empty group or a missing limit means there is nothing to clip.
        if params and max_norm is not None:
            nn.utils.clip_grad_norm_(params, max_norm)

    trainable = [p for p in model.parameters() if p.requires_grad]
    if transformer_grad_norm is None:
        _clip(trainable, grad_norm)
        return

    # Partition by object identity so that membership never depends on tensor hashing/equality.
    transformer_ids = {id(p) for p in transformer.parameters()}
    is_transformer = [p for p in trainable if id(p) in transformer_ids]
    non_transformer = [p for p in trainable if id(p) not in transformer_ids]
    # grad_norm may legitimately be None here: only the transformer group is clipped then.
    _clip(non_transformer, grad_norm)
    _clip(is_transformer, transformer_grad_norm)


def load_word2vec(path, delimiter=' ', cache=True) -> Tuple[Dict[str, np.ndarray], int]:
    """Load word vectors from a text file, optionally going through a pickle cache.

    Lines with two fields or fewer (for instance a ``count dim`` header) are skipped. The first
    accepted line fixes the expected number of fields; later lines of a different length are
    logged and skipped.

    Args:
      path: Resource identifier resolved through ``get_resource``.
      delimiter: Separator between the word and the vector components. (Default value = ' ')
      cache: Whether to read from / write to a ``.pkl`` file next to the resolved path.
        (Default value = True)

    Returns:
      A tuple of the word-to-vector dict (``float32`` arrays) and the vector dimension.

    Raises:
      ValueError: If the file contains no usable vector line.
    """
    realpath = get_resource(path)
    binpath = replace_ext(realpath, '.pkl')
    if cache:
        try:
            flash('Loading word2vec from cache [blink][yellow]...[/yellow][/blink]')
            word2vec, dim = load_pickle(binpath)
            flash('')
            return word2vec, dim
        except IOError:
            # No readable cache yet: fall through and parse the text file.
            pass

    # Number of fields (word + components) every vector line must have, fixed by the first such line.
    num_fields = None
    word2vec = dict()
    f = TimingFileIterator(realpath)
    for idx, line in enumerate(f):
        f.log('Loading word2vec from text file [blink][yellow]...[/yellow][/blink]')
        fields = line.rstrip().split(delimiter)
        if len(fields) <= 2:
            continue
        if num_fields is None:
            num_fields = len(fields)
        elif len(fields) != num_fields:
            logger.warning('{}#{} length mismatches with {}'.format(path, idx + 1, num_fields))
            continue
        word2vec[fields[0]] = np.array(fields[1:], dtype=np.float32)
    if num_fields is None:
        # Previously this surfaced as an opaque TypeError from `None - 1`.
        raise ValueError(f'No word vectors found in {path}')
    dim = num_fields - 1
    if cache:
        flash('Caching word2vec [blink][yellow]...[/yellow][/blink]')
        save_pickle((word2vec, dim), binpath)
        flash('')
    return word2vec, dim


def load_word2vec_as_vocab_tensor(path, delimiter=' ', cache=True) -> Tuple[Dict[str, int], torch.Tensor]:
    """Load word vectors as a word-to-row vocabulary and a matrix stacking the vectors row by row.

    Args:
      path: Resource identifier resolved through ``get_resource``.
      delimiter: Forwarded to ``load_word2vec``. (Default value = ' ')
      cache: Whether to read from / write to ``.vocab`` and ``.pt`` files next to the resolved
        path. Also forwarded to ``load_word2vec``. (Default value = True)

    Returns:
      A tuple ``(vocab, matrix)`` where ``vocab[word]`` is the row of ``word`` in ``matrix``.
    """
    realpath = get_resource(path)
    vocab_path = replace_ext(realpath, '.vocab')
    matrix_path = replace_ext(realpath, '.pt')
    if cache:
        try:
            if HANLP_VERBOSE:
                flash('Loading vocab and matrix from cache [blink][yellow]...[/yellow][/blink]')
            cached_vocab = load_pickle(vocab_path)
            cached_matrix = torch.load(matrix_path, map_location='cpu')
            if HANLP_VERBOSE:
                flash('')
            return cached_vocab, cached_matrix
        except IOError:
            # Cache files missing or unreadable: rebuild from the vectors below.
            pass

    word2vec, _ = load_word2vec(path, delimiter, cache)
    # Row order follows the iteration order of the loaded dict.
    vocab = {word: row for row, word in enumerate(word2vec.keys())}
    matrix = torch.Tensor(np.stack(list(word2vec.values())))
    if not cache:
        return vocab, matrix
    flash('Caching vocab and matrix [blink][yellow]...[/yellow][/blink]')
    save_pickle(vocab, vocab_path)
    torch.save(matrix, matrix_path)
    flash('')
    return vocab, matrix


def save_word2vec(word2vec: dict, filepath, delimiter=' '):
    """Write word vectors as text, one ``word<delimiter>v1<delimiter>v2...`` line per entry.

    Args:
      word2vec: Mapping from word to an iterable of vector components.
      filepath: Destination file, written as UTF-8 and overwritten if it exists.
      delimiter: Separator placed between the word and each component. (Default value = ' ')
    """
    with open(filepath, 'w', encoding='utf-8') as out:
        for word, vector in word2vec.items():
            components = delimiter.join(str(x) for x in vector)
            out.write(f'{word}{delimiter}{components}\n')


def lengths_to_mask(seq_len, max_len=None):
    r"""Turn a vector of sequence lengths into a boolean mask that is true at valid positions.

    .. code-block::

        >>> seq_len = torch.arange(2, 16)
        >>> mask = lengths_to_mask(seq_len)
        >>> print(mask.size())
        torch.Size([14, 15])
        >>> seq_len = torch.arange(2, 16)
        >>> mask = lengths_to_mask(seq_len, max_len=100)
        >>> print(mask.size())
        torch.Size([14, 100])

    :param torch.LongTensor seq_len: (B,) tensor of lengths. Only tensors are supported, since
        ``dim()``, ``size()`` and ``unsqueeze()`` are called on it.
    :param int max_len: width of the mask. When ``None`` (or 0) the largest value in ``seq_len``
        is used.
    :return: torch.Tensor (B, max_len), where entry ``[i, j]`` is true iff ``j < seq_len[i]``
    """
    assert seq_len.dim() == 1, f"seq_len can only have one dimension, got {seq_len.dim()}."
    batch_size = seq_len.size(0)
    if max_len:
        max_len = int(max_len)
    elif batch_size == 0:
        # max() is undefined on an empty tensor; an empty batch yields an empty mask.
        max_len = 0
    else:
        max_len = int(seq_len.max())
    # .to(seq_len) moves the position grid to the dtype and device of seq_len.
    broad_cast_seq_len = torch.arange(max_len).expand(batch_size, -1).to(seq_len)
    mask = broad_cast_seq_len.lt(seq_len.unsqueeze(1))

    return mask


def activation_from_name(name: str):
    """Look up an attribute of ``torch.nn`` by name.

    Args:
      name: Attribute name, e.g. ``'ReLU'``.

    Returns:
      Whatever ``torch.nn`` exposes under ``name``; an ``AttributeError`` propagates if absent.
    """
    activation = getattr(torch.nn, name)
    return activation


def filter_state_dict_safely(model_state: dict, load_state: dict):
    """Keep only the entries of ``load_state`` whose key exists in ``model_state`` with the same shape.

    Args:
      model_state: Reference mapping from name to an object with a ``shape``.
      load_state: Candidate mapping to filter.

    Returns:
      A new dict holding the compatible ``load_state`` entries, in ``load_state`` order.
    """

    def _compatible(key, candidate):
        existing = model_state.get(key, None)
        return existing is not None and existing.shape == candidate.shape

    return {key: candidate for key, candidate in load_state.items() if _compatible(key, candidate)}


================================================
FILE: hanlp/version.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 19:26

# Version string of the package; the bare string literal below serves as its attribute docstring.
__version__ = '2.1.0-beta.64'
"""HanLP version"""


class NotCompatible(Exception):
    """Exception subclass that adds no behaviour of its own.

    NOTE(review): the name suggests it signals an incompatibility (e.g. between versions);
    confirm against the code that raises it.
    """


================================================
FILE: plugins/README.md
================================================
# Plugins for HanLP

This directory contains modules shared across several individual packages or non-core APIs.
If you plan to submit any plugins, please put them here too.

For developers, run the following set-up.

```bash
pip install -e hanlp_trie
pip install -e hanlp_common
pip install -e hanlp_restful
```

================================================
FILE: plugins/hanlp_common/README.md
================================================
# Common utilities and structures for HanLP

[中文](https://github.com/hankcs/HanLP/tree/doc-zh) | [1.x](https://github.com/hankcs/HanLP/tree/1.x) | [forum](https://bbs.hankcs.com/) | [docker](https://github.com/WalterInSH/hanlp-jupyter-docker)

The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be efficient, user friendly and extendable. It comes with pretrained models for various human languages including English, Chinese and many others. Currently, HanLP 2.0 is in alpha stage with more killer features on the roadmap. Discussions are welcomed on our [forum](https://bbs.hankcs.com/), while bug reports and feature requests are reserved for GitHub issues. For Java users, please check out the [1.x](https://github.com/hankcs/HanLP/tree/1.x) branch.


## Installation

```bash
pip install hanlp
```

## License

HanLP is licensed under **Apache License 2.0**. You can use HanLP in your commercial products for free. We would appreciate it if you add a link to HanLP on your website.


================================================
FILE: plugins/hanlp_common/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-16 22:20


================================================
FILE: plugins/hanlp_common/hanlp_common/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-16 22:21


================================================
FILE: plugins/hanlp_common/hanlp_common/amr.py
================================================
# MIT License
#
# Copyright (c) 2019 Sheng Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import json
import logging
import re
import traceback
from collections import Counter, defaultdict

from hanlp_common.io import eprint

try:
    import networkx as nx
    import penman
    from penman import Triple
except ModuleNotFoundError:
    traceback.print_exc()
    eprint('AMR support requires the full version which can be installed via:\n'
           'pip install hanlp_common[full]')
    exit(1)

# Placeholder tokens. DEFAULT_OOV_TOKEN doubles as the fallback tag in AMRGraph.get_list_data.
DEFAULT_PADDING_TOKEN = "@@PADDING@@"
DEFAULT_OOV_TOKEN = "@@UNKNOWN@@"
logger = logging.getLogger('amr')

# Disable inverting ':mod' relation.
# This mutates attributes looked up on the penman.AMRCodec class itself, at import time.
penman.AMRCodec._inversions.pop('domain')
penman.AMRCodec._deinversions.pop('mod')

# Module-wide codec used for encoding and decoding graphs.
amr_codec = penman.AMRCodec(indent=6)

# Matches a trailing hyphen followed by exactly two digits, e.g. the "-01" in "run-01".
WORDSENSE_RE = re.compile(r'-\d\d$')
# Matches a string that both starts and ends with a double quote.
QUOTED_RE = re.compile(r'^".*"$')


def is_abstract_token(token):
    """Tell whether ``token`` looks like an abstract placeholder.

    Two shapes qualify: one or more upper-case words each followed by ``_`` and then digits
    (e.g. ``PERSON_1``), or a single digit followed only by zeros (e.g. ``5``, ``100``).

    Returns:
      The regex match object for the first shape that fits, otherwise ``None``.
    """
    labelled = re.search(r'^([A-Z]+_)+\d+$', token)
    if labelled:
        return labelled
    return re.search(r'^\d0*$', token)


def is_english_punct(c):
    """Tell whether ``c`` is exactly one character from a fixed set of English punctuation marks.

    Returns:
      The regex match object if ``c`` qualifies, otherwise ``None``.
    """
    # NOTE(review): inside the character class, `'-(` forms a range from the apostrophe to the
    # opening parenthesis, so a lone hyphen is not matched. Confirm whether that is intended
    # before changing the pattern, since callers may depend on the current behaviour.
    punct_pattern = r'^[,.?!:;"\'-(){}\[\]]$'
    return re.search(punct_pattern, c)


def find_similar_token(token, tokens):
    """Return the first entry of ``tokens`` equal to ``token`` once a trailing ``-NN`` suffix
    (hyphen plus two digits) has been stripped from ``token``.

    Only exact, case-sensitive equality is used for the comparison.

    Args:
      token: Token to look for, possibly carrying a two-digit sense suffix.
      tokens: Candidates to search, in order.

    Returns:
      The matching candidate, or ``None`` if there is none.
    """
    stem = re.sub(r'-\d\d$', '', token)
    for candidate in tokens:
        if stem == candidate:
            return candidate
    return None


class AMR:
    """One AMR example: the sentence, its token-level annotations and the graph.

    All fields default to ``None``. ``tokens``, ``lemmas``, ``pos_tags`` and ``ner_tags`` are
    treated as parallel lists by the span helpers.
    """

    def __init__(self,
                 id=None,
                 sentence=None,
                 graph=None,
                 tokens=None,
                 lemmas=None,
                 pos_tags=None,
                 ner_tags=None,
                 abstract_map=None,
                 misc=None):
        self.id = id
        self.sentence = sentence
        self.graph = graph
        self.tokens = tokens
        self.lemmas = lemmas
        self.pos_tags = pos_tags
        self.ner_tags = ner_tags
        self.abstract_map = abstract_map
        self.misc = misc

    def is_named_entity(self, index):
        """Return whether the NER tag at ``index`` is anything other than ``'0'`` or ``'O'``."""
        return self.ner_tags[index] not in ('0', 'O')

    def get_named_entity_span(self, index):
        """Collect the indexes of the run of identical NER tags around ``index``.

        Returns:
          ``[]`` when there are no NER tags or ``index`` is not a named entity. Otherwise
          ``index`` first, then the positions to its left (nearest first), then those to its right.
        """
        if self.ner_tags is None or not self.is_named_entity(index):
            return []
        span = [index]
        tag = self.ner_tags[index]
        left = index - 1
        # `>= 0` (not `> 0`) so that a run reaching the first token includes index 0.
        while left >= 0 and self.ner_tags[left] == tag:
            span.append(left)
            left -= 1
        right = index + 1
        while right < len(self.ner_tags) and self.ner_tags[right] == tag:
            span.append(right)
            right += 1
        return span

    def find_span_indexes(self, span):
        """Return the indexes of the first occurrence of ``span`` in ``self.tokens``, or ``None``."""
        width = len(span)
        for start, token in enumerate(self.tokens):
            if token != span[0]:
                continue
            window = self.tokens[start: start + width]
            # The window is shorter than the span when it runs off the end of the sentence.
            if len(window) == width and all(x == y for x, y in zip(span, window)):
                return list(range(start, start + width))
        return None

    def replace_span(self, indexes, new, pos=None, ner=None):
        """Replace the tokens from ``indexes[0]`` to ``indexes[-1]`` (inclusive) in every parallel list.

        Args:
          indexes: Positions of the span; only the first and last entries are used.
          new: Replacement list, used for both ``tokens`` and ``lemmas``.
          pos: Replacement POS tags; defaults to the tag of the first replaced token.
          ner: Replacement NER tags; defaults to the tag of the first replaced token.
        """
        first, after_last = indexes[0], indexes[-1] + 1
        self.tokens = self.tokens[:first] + new + self.tokens[after_last:]
        self.lemmas = self.lemmas[:first] + new + self.lemmas[after_last:]
        if pos is None:
            pos = [self.pos_tags[first]]
        self.pos_tags = self.pos_tags[:first] + pos + self.pos_tags[after_last:]
        if ner is None:
            ner = [self.ner_tags[first]]
        self.ner_tags = self.ner_tags[:first] + ner + self.ner_tags[after_last:]

    def remove_span(self, indexes):
        """Delete the span given by ``indexes`` from every parallel list."""
        self.replace_span(indexes, [], [], [])

    def __repr__(self):
        """Render the non-``None`` fields as ``# ::key value`` lines followed by the graph.

        ``misc`` entries are emitted verbatim, the graph through ``str()``, and non-string
        values as JSON.
        """
        fields = []
        for k, v in dict(
                id=self.id,
                snt=self.sentence,
                tokens=self.tokens,
                lemmas=self.lemmas,
                pos_tags=self.pos_tags,
                ner_tags=self.ner_tags,
                abstract_map=self.abstract_map,
                misc=self.misc,
                graph=self.graph
        ).items():
            if v is None:
                continue
            if k == 'misc':
                fields += v
            elif k == 'graph':
                fields.append(str(v))
            else:
                if not isinstance(v, str):
                    v = json.dumps(v)
                fields.append('# ::{} {}'.format(k, v))
        return '\n'.join(fields)

    def get_src_tokens(self):
        """Return the lemmas if there are any, otherwise the whitespace-split sentence."""
        return self.lemmas if self.lemmas else self.sentence.split()


class AMRNode:
    """A graph node: an identifier plus a list of ``(attribute, value)`` pairs.

    Equality and hashing depend on ``identifier`` only.
    """

    # Attribute names in the order used by ``_sort_attributes``.
    attribute_priority = [
        'instance', 'quant', 'mode', 'value', 'name', 'li', 'mod', 'frequency',
        'month', 'day', 'year', 'time', 'unit', 'decade', 'poss'
    ]

    def __init__(self, identifier, attributes=None, copy_of=None):
        """
        Args:
          identifier: Name of the node.
          attributes: List of ``(attribute, value)`` pairs, stored without copying.
            ``None`` means an empty list.
          copy_of: The node this one was copied from, if any.
        """
        self.identifier = identifier
        if attributes is None:
            self.attributes = []
        else:
            self.attributes = attributes
            # self._sort_attributes()
        self._num_copies = 0
        self.copy_of = copy_of

    def _sort_attributes(self):
        """Sort attributes in place: prioritised names first, then other names, then ARG/op/snt ones."""

        def get_attr_priority(attr):
            if attr in self.attribute_priority:
                return self.attribute_priority.index(attr), attr
            if not re.search(r'^(ARG|op|snt)', attr):
                return len(self.attribute_priority), attr
            else:
                return len(self.attribute_priority) + 1, attr

        self.attributes.sort(key=lambda x: get_attr_priority(x[0]))

    def __hash__(self):
        return hash(self.identifier)

    def __eq__(self, other):
        if not isinstance(other, AMRNode):
            return False
        return self.identifier == other.identifier

    def __repr__(self):
        """Return ``identifier`` or ``identifier / instance`` when an instance attribute exists."""
        ret = str(self.identifier)
        for k, v in self.attributes:
            if k == 'instance':
                ret += ' / ' + v
                break
        return ret

    def __str__(self):
        """Return ``repr(self)`` followed by one tab-indented ``:key value`` line per other attribute."""
        ret = repr(self)
        for key, value in self.attributes:
            if key == 'instance':
                continue
            ret += '\n\t:{} {}'.format(key, value)
        return ret

    @property
    def instance(self):
        """Value of the first ``instance`` attribute, or ``None`` if there is none."""
        for key, value in self.attributes:
            if key == 'instance':
                return value
        return None

    @property
    def ops(self):
        """Values of the ``opN`` attributes, ordered by ``N``."""
        numbered = []
        for key, value in self.attributes:
            # Anchored so that keys such as 'op1-of' are not mistaken for 'opN' and then fail
            # integer parsing; this matches the pattern used by AMRGraph.remove_node_ops.
            if re.search(r'^op\d+$', key):
                numbered.append((int(key[2:]), value))
        numbered.sort(key=lambda pair: pair[0])
        return [value for _, value in numbered]

    def copy(self):
        """Create a node named ``<identifier>_copy_<n>`` with a shallow copy of the attributes.

        ``n`` counts the copies made from this node so far; the new node's ``copy_of`` is this node.
        """
        attributes = None
        if self.attributes is not None:
            attributes = self.attributes[:]
        self._num_copies += 1
        copy = AMRNode(self.identifier + '_copy_{}'.format(self._num_copies), attributes, self)
        return copy

    def remove_attribute(self, attr, value):
        """Remove the first ``(attr, value)`` pair; ``ValueError`` propagates if it is absent."""
        self.attributes.remove((attr, value))

    def add_attribute(self, attr, value):
        """Append ``(attr, value)`` to the attributes."""
        self.attributes.append((attr, value))

    def replace_attribute(self, attr, old, new):
        """Replace the first ``(attr, old)`` pair by ``(attr, new)``; ``ValueError`` if absent."""
        index = self.attributes.index((attr, old))
        self.attributes[index] = (attr, new)

    def get_frame_attributes(self):
        """Yield the pairs whose value is a string ending in a hyphen and two digits."""
        for k, v in self.attributes:
            if isinstance(v, str) and re.search(r'-\d\d$', v):
                yield k, v

    def get_senseless_attributes(self):
        """Yield the pairs whose value is a string not ending in a hyphen and two digits."""
        for k, v in self.attributes:
            if isinstance(v, str) and not re.search(r'-\d\d$', v):
                yield k, v


class AMRGraph(penman.Graph):
    # Relation labels obtained by splitting the space-separated string below.
    # NOTE(review): the name suggests the order encodes a priority, but none of the methods
    # visible in this part of the class reads it (sort_edges returns its input unchanged) — confirm usage.
    edge_label_priority = (
        'mod name time location degree poss domain quant manner unit purpose topic condition part-of compared-to '
        'duration source ord beneficiary concession direction frequency consist-of example medium location-of '
        'manner-of quant-of time-of instrument prep-in destination accompanier prep-with extent instrument-of age '
        'path concession-of subevent-of prep-as prep-to prep-against prep-on prep-for degree-of prep-under part '
        'condition-of prep-without topic-of season duration-of poss-of prep-from prep-at range purpose-of source-of '
        'subevent example-of value path-of scale conj-as-if prep-into prep-by prep-on-behalf-of medium-of prep-among '
        'calendar beneficiary-of prep-along-with extent-of age-of frequency-of dayperiod accompanier-of '
        'destination-of prep-amid prep-toward prep-in-addition-to ord-of name-of weekday direction-of prep-out-of '
        'timezone subset-of'.split())

    def __init__(self, penman_graph):
        """Wrap an already decoded penman graph.

        Args:
          penman_graph: Graph whose ``_triples`` and ``_top`` are taken over (shared, not copied).
        """
        super().__init__()
        self._top = penman_graph._top
        self._triples = penman_graph._triples
        self._src_tokens = []
        # Derive the node lookup table and the directed graph from the triples.
        self._build_extras()

    def __str__(self):
        """Encode the graph with the module codec.

        As a side effect, the stored triples are replaced by their ``penman.alphanum_order`` ordering.
        """
        ordered = penman.alphanum_order(self._triples)
        self._triples = ordered
        return amr_codec.encode(self)

    def _build_extras(self):
        """Build ``self.variable_to_node`` and the directed graph ``self._G`` from the triples."""
        graph = nx.DiGraph()

        # One AMRNode per string-typed variable, carrying that variable's attribute pairs.
        self.variable_to_node = nodes_by_variable = {}
        for variable in self.variables():
            if type(variable) is not str:
                continue
            pairs = [(triple.relation, triple.target) for triple in self.attributes(source=variable)]
            amr_node = AMRNode(variable, pairs)
            graph.add_node(amr_node)
            nodes_by_variable[variable] = amr_node

        seen_pairs = set()
        for edge in self.edges():
            if type(edge.source) is not str:
                continue
            head = self.variable_to_node[edge.source]
            tail = self.variable_to_node[edge.target]
            label = edge.relation

            # Instance triples and self-loops do not become graph edges.
            if label == 'instance' or head == tail:
                continue

            if edge.inverted:
                head, tail, label = tail, head, amr_codec.invert_relation(edge.relation)

            # A second edge between the same pair is attached to a fresh copy of the target.
            if (head, tail) in seen_pairs:
                tail = tail.copy()

            seen_pairs.add((head, tail))
            graph.add_edge(head, tail, label=label)

        self._G = graph

    def attributes(self, source=None, relation=None, target=None):
        """Return the attribute triples, optionally filtered by source, relation and/or target.

        A triple counts as an attribute when its target is not a variable or its relation is
        ``'instance'``. A filter argument left as ``None`` matches everything.
        """
        # Refine attributes because there's a bug in penman.attributes()
        # See https://github.com/goodmami/penman/issues/29

        def matches(triple):
            return (
                    (source is None or source == triple.source) and
                    (relation is None or relation == triple.relation) and
                    (target is None or target == triple.target)
            )

        variables = self.variables()
        return [
            triple for triple in self.triples()
            if (triple.target not in variables or triple.relation == 'instance') and matches(triple)
        ]

    def _update_penman_graph(self, triples):
        """Install ``triples`` and clear the top if it is no longer among the variables."""
        self._triples = triples
        top_still_present = self._top in self.variables()
        if not top_still_present:
            self._top = None

    def is_name_node(self, node):
        """Return whether any incoming edge of ``node`` is labelled ``'name'``."""
        for parent, child in list(self._G.in_edges(node)):
            if self._G[parent][child].get('label', None) == 'name':
                return True
        return False

    def get_name_node_type(self, node):
        """Return the instance of the first parent that reaches ``node`` through a ``'name'`` edge.

        Raises:
          KeyError: If ``node`` has no incoming ``'name'`` edge.
        """
        incoming = list(self._G.in_edges(node))
        name_parents = [
            source for source, target in incoming
            if self._G[source][target].get('label', None) == 'name'
        ]
        if not name_parents:
            raise KeyError
        return name_parents[0].instance

    def get_name_node_wiki(self, node):
        """Return the ``wiki`` attribute of a parent linked to ``node`` by a ``'name'`` edge.

        A value of ``'-'`` is returned as is; any other value has its first and last character
        (the quotes) stripped. Returns ``None`` when no such parent carries a ``wiki`` attribute.
        """
        for source, target in list(self._G.in_edges(node)):
            if self._G[source][target].get('label', None) != 'name':
                continue
            for attr, value in source.attributes:
                if attr != 'wiki':
                    continue
                if value != '-':
                    value = value[1:-1]  # remove quotes
                return value
        return None

    def set_name_node_wiki(self, node, wiki):
        """Add a ``wiki`` attribute to the first parent linked to ``node`` by a ``'name'`` edge.

        ``wiki`` is wrapped in double quotes unless it is ``'-'``. Nothing happens when there is
        no such parent.
        """
        parent = None
        for source, target in list(self._G.in_edges(node)):
            if self._G[source][target].get('label', None) == 'name':
                parent = source
                break
        if not parent:
            return
        if wiki != '-':
            wiki = '"{}"'.format(wiki)
        self.add_node_attribute(parent, 'wiki', wiki)

    def is_date_node(self, node):
        """Return whether the instance of ``node`` is ``'date-entity'``."""
        instance = node.instance
        return instance == 'date-entity'

    def add_edge(self, source, target, label):
        """Add a labelled edge to the directed graph and the matching triple to the penman graph."""
        self._G.add_edge(source, target, label=label)
        new_triple = penman.Triple(source=source.identifier, relation=label, target=target.identifier)
        reordered = penman.alphanum_order(self._triples + [new_triple])
        self._update_penman_graph(reordered)

    def remove_edge(self, x, y):
        """Drop every triple going from ``x`` to ``y``.

        ``x`` and ``y`` may be nodes or plain identifiers. The edge is removed from the directed
        graph only when both are ``AMRNode`` objects.
        """
        x_is_node = isinstance(x, AMRNode)
        y_is_node = isinstance(y, AMRNode)
        if x_is_node and y_is_node:
            self._G.remove_edge(x, y)
        source_id = x.identifier if x_is_node else x
        target_id = y.identifier if y_is_node else y
        kept = [t for t in self._triples if not (t.source == source_id and t.target == target_id)]
        self._update_penman_graph(kept)

    def update_edge_label(self, x, y, old, new):
        """Relabel the edge from ``x`` to ``y`` in the directed graph and rewrite matching ``old`` triples."""
        self._G[x][y]['label'] = new

        def relabel(triple):
            if triple.source == x.identifier and triple.target == y.identifier and triple.relation == old:
                return Triple(x.identifier, new, y.identifier)
            return triple

        self._update_penman_graph([relabel(t) for t in self._triples])

    def add_node(self, instance):
        """Create a node for ``instance`` under a fresh variable name and return it.

        The variable is the first character of ``instance`` (which must be alphabetic). If that is
        already taken, the smallest numeric suffix starting from 2 that is free is appended.
        """
        identifier = instance[0]
        assert identifier.isalpha()
        if identifier in self.variables():
            suffix = 2
            while '{}{}'.format(identifier, suffix) in self.variables():
                suffix += 1
            identifier = '{}{}'.format(identifier, suffix)
        instance_triple = Triple(identifier, 'instance', instance)
        self._triples = penman.alphanum_order(self._triples + [instance_triple])

        node = AMRNode(identifier, [('instance', instance)])
        self._G.add_node(node)
        return node

    def remove_node(self, node):
        """Remove ``node`` from the directed graph and drop every triple whose source is that node."""
        self._G.remove_node(node)
        surviving = [t for t in self._triples if t.source != node.identifier]
        self._update_penman_graph(surviving)

    def replace_node_attribute(self, node, attr, old, new):
        """Change the value of an attribute from ``old`` to ``new`` on the node and in the triples.

        Raises:
          KeyError: If no triple ``(node, attr, old)`` exists. The node's own attribute list has
            already been updated at that point.
        """
        node.replace_attribute(attr, old, new)
        rewritten = []
        replaced_any = False
        for triple in self._triples:
            is_target = triple.source == node.identifier and triple.relation == attr and triple.target == old
            if is_target:
                replaced_any = True
                triple = penman.Triple(source=node.identifier, relation=attr, target=new)
            rewritten.append(triple)
        if not replaced_any:
            raise KeyError
        self._triples = penman.alphanum_order(rewritten)

    def remove_node_attribute(self, node, attr, value):
        """Remove the attribute ``(attr, value)`` from the node and the matching triples."""
        node.remove_attribute(attr, value)

        def is_removed(triple):
            return triple.source == node.identifier and triple.relation == attr and triple.target == value

        self._update_penman_graph([t for t in self._triples if not is_removed(t)])

    def add_node_attribute(self, node, attr, value):
        """Append the attribute ``(attr, value)`` to the node and add the matching triple."""
        node.add_attribute(attr, value)
        new_triple = penman.Triple(source=node.identifier, relation=attr, target=value)
        self._triples = penman.alphanum_order(self._triples + [new_triple])

    def remove_node_ops(self, node):
        """Remove every ``opN`` attribute from ``node``."""
        # Snapshot first: the attribute list is modified while the attributes are removed.
        op_pairs = [(attr, value) for attr, value in node.attributes if re.search(r'^op\d+$', attr)]
        for attr, value in op_pairs:
            self.remove_node_attribute(node, attr, value)

    def remove_subtree(self, root):
        """Detach ``root`` from its children and recursively remove whatever is left without a parent.

        Returns:
          The set of nodes that were removed. ``root`` itself is removed only if it has no
          incoming edge.
        """
        removed = set()
        children = [child for _, child in list(self._G.edges(root))]
        for child in children:
            self.remove_edge(root, child)
        for child in children:
            if len(list(self._G.in_edges(child))) == 0:
                removed.update(self.remove_subtree(child))
        if len(list(self._G.in_edges(root))) == 0:
            self.remove_node(root)
            removed.add(root)
        return removed

    def get_subtree(self, root, max_depth):
        """List ``root``, its children and, recursively, the subtrees of children with a single parent.

        Recursion stops once ``max_depth`` reaches 0, in which case an empty list is returned.
        Nodes may appear more than once in the result.
        """
        if max_depth == 0:
            return []
        children = [child for _, child in self._G.edges(root)]
        nodes = [root] + children
        for child in children:
            if len(list(self._G.in_edges(child))) == 1:
                nodes = nodes + self.get_subtree(child, max_depth - 1)
        return nodes

    def get_nodes(self):
        """Return the ``nodes`` view of the underlying directed graph."""
        graph = self._G
        return graph.nodes

    def get_edges(self):
        """Return the ``edges`` view of the underlying directed graph."""
        graph = self._G
        return graph.edges

    def set_src_tokens(self, sentence):
        """Store the source tokens; anything that is not exactly a ``list`` is split on single spaces."""
        tokens = sentence if type(sentence) is list else sentence.split(" ")
        self._src_tokens = tokens

    def get_src_tokens(self):
        """Return the source tokens stored by ``set_src_tokens`` (an empty list initially)."""
        tokens = self._src_tokens
        return tokens

    def get_list_node(self, replace_copy=True):
        """Linearise the graph depth-first from the top node.

        Args:
          replace_copy: When true, copied nodes are reported as the node they were copied from.

        Returns:
          A list of ``(node, relation, parent)`` triples in visiting order. The first entry is the
          top node with relation ``'root'`` and itself as parent. Children of a node are expanded
          only the first time that node is visited.
        """
        expanded = defaultdict(int)
        entries = []

        def original_of(candidate):
            if replace_copy and candidate.copy_of is not None:
                return candidate.copy_of
            return candidate

        def visit(node, relation, parent):
            entries.append((original_of(node), relation, original_of(parent)))

            if len(self._G[node]) == 0 or expanded[node] != 0:
                return
            expanded[node] = 1
            for child, edge_data in self.sort_edges(self._G[node].items()):
                visit(child, edge_data["label"], node)

        top_node = self.variable_to_node[self._top]
        visit(top_node, 'root', self.variable_to_node[self._top])

        return entries

    def sort_edges(self, edges):
        """Return ``edges`` unchanged; this is the hook ``get_list_node`` calls to order a node's children."""
        unchanged = edges
        return unchanged

    def get_tgt_tokens(self):
        """Return the target token sequence of the linearised graph.

        Each visited node contributes its instance. On its first visit it also contributes the
        values of its other attributes, in attribute order.
        """
        tokens = []
        seen = defaultdict(int)

        for node, _relation, _parent in self.get_list_node():
            instances = [attr[1] for attr in node.attributes if attr[0] == "instance"]
            assert len(instances) == 1
            tokens.append(str(instances[0]))

            if len(node.attributes) > 1 and seen[node] == 0:
                tokens.extend(str(attr[1]) for attr in node.attributes if attr[0] != "instance")

            seen[node] = 1

        return tokens

    def get_list_data(self, amr, bos=None, eos=None, bert_tokenizer=None, max_tgt_length=None):
        """Flatten the graph into parallel target-side lists plus source-copy information.

        Args:
          amr: Object whose ``pos_tags`` attribute supplies the source-side POS tags.
          bos: Optional token prepended to the target tokens.
          eos: Optional token appended to the target tokens.
          bert_tokenizer: Optional object whose ``tokenize(src_tokens, True)`` returns the source
            token ids and the subword index.
          max_tgt_length: If given, the target lists are cut to this length before ``bos``/``eos``
            are added.

        Returns:
          A dict with the keys listed in the return statement at the end of this method.
        """
        node_list = self.get_list_node()

        tgt_tokens = []
        head_tags = []
        head_indices = []

        # Every position at which a node's instance token was emitted, per node.
        node_to_idx = defaultdict(list)
        visited = defaultdict(int)

        def update_info(node, relation, parent, token):
            # Heads are 1-based and point at the latest occurrence of the parent.
            head_indices.append(1 + node_to_idx[parent][-1])
            head_tags.append(relation)
            tgt_tokens.append(str(token))

        for node, relation, parent_node in node_list:

            node_to_idx[node].append(len(tgt_tokens))

            instance = [attr[1] for attr in node.attributes if attr[0] == "instance"]
            assert len(instance) == 1
            instance = instance[0]

            update_info(node, relation, parent_node, instance)

            # Non-instance attributes are emitted only on the first visit, with the node as head.
            if len(node.attributes) > 1 and visited[node] == 0:
                for attr in node.attributes:
                    if attr[0] != "instance":
                        update_info(node, attr[0], node, attr[1])

            visited[node] = 1

        def trim_very_long_tgt_tokens(tgt_tokens, head_tags, head_indices, node_to_idx):
            # Truncate the three parallel lists and forget node positions that fell off the end.
            tgt_tokens = tgt_tokens[:max_tgt_length]
            head_tags = head_tags[:max_tgt_length]
            head_indices = head_indices[:max_tgt_length]
            for node, indices in node_to_idx.items():
                invalid_indices = [index for index in indices if index >= max_tgt_length]
                for index in invalid_indices:
                    indices.remove(index)
            return tgt_tokens, head_tags, head_indices, node_to_idx

        if max_tgt_length is not None:
            tgt_tokens, head_tags, head_indices, node_to_idx = trim_very_long_tgt_tokens(
                tgt_tokens, head_tags, head_indices, node_to_idx)

        # bos shifts token positions by one; only tgt_tokens receives bos/eos here.
        copy_offset = 0
        if bos:
            tgt_tokens = [bos] + tgt_tokens
            copy_offset += 1
        if eos:
            tgt_tokens = tgt_tokens + [eos]

        # The first occurrence of the top node gets head 0.
        head_indices[node_to_idx[self.variable_to_node[self.top]][0]] = 0

        # Target side Coreference
        tgt_copy_indices = [i for i in range(len(tgt_tokens))]

        # Later occurrences of a node point back at its first occurrence.
        for node, indices in node_to_idx.items():
            if len(indices) > 1:
                copy_idx = indices[0] + copy_offset
                for token_idx in indices[1:]:
                    tgt_copy_indices[token_idx + copy_offset] = copy_idx

        tgt_copy_map = [(token_idx, copy_idx) for token_idx, copy_idx in enumerate(tgt_copy_indices)]

        for i, copy_index in enumerate(tgt_copy_indices):
            # Set the coreferred target to 0 if no coref is available.
            if i == copy_index:
                tgt_copy_indices[i] = 0

        # Flag every target token whose surface form occurs more than once.
        tgt_token_counter = Counter(tgt_tokens)
        tgt_copy_mask = [0] * len(tgt_tokens)
        for i, token in enumerate(tgt_tokens):
            if tgt_token_counter[token] > 1:
                tgt_copy_mask[i] = 1

        def add_source_side_tags_to_target_side(_src_tokens, _src_tags):
            assert len(_src_tags) == len(_src_tokens)
            tag_counter = defaultdict(lambda: defaultdict(int))
            for src_token, src_tag in zip(_src_tokens, _src_tags):
                tag_counter[src_token][src_tag] += 1

            # Lookup table from source token to its most frequent tag.
            tag_lut = {DEFAULT_OOV_TOKEN: DEFAULT_OOV_TOKEN,
                       DEFAULT_PADDING_TOKEN: DEFAULT_OOV_TOKEN}
            for src_token in set(_src_tokens):
                tag = max(tag_counter[src_token].keys(), key=lambda x: tag_counter[src_token][x])
                tag_lut[src_token] = tag

            # A target token takes the tag of the first matching source token, else the OOV tag.
            tgt_tags = []
            for tgt_token in tgt_tokens:
                sim_token = find_similar_token(tgt_token, _src_tokens)
                if sim_token is not None:
                    index = _src_tokens.index(sim_token)
                    tag = _src_tags[index]
                else:
                    tag = DEFAULT_OOV_TOKEN
                tgt_tags.append(tag)

            return tgt_tags, tag_lut

        # Source Copy
        src_tokens = self.get_src_tokens()
        src_token_ids = None
        src_token_subword_index = None
        src_pos_tags = amr.pos_tags
        # SourceCopyVocabulary is defined outside the part of the file shown here.
        src_copy_vocab = SourceCopyVocabulary(src_tokens)
        src_copy_indices = src_copy_vocab.index_sequence(tgt_tokens)
        src_copy_map = src_copy_vocab.get_copy_map(src_tokens)
        tgt_pos_tags, pos_tag_lut = add_source_side_tags_to_target_side(src_tokens, src_pos_tags)

        if bert_tokenizer is not None:
            src_token_ids, src_token_subword_index = bert_tokenizer.tokenize(src_tokens, True)

        src_must_copy_tags = [1 if is_abstract_token(t) else 0 for t in src_tokens]
        # Copy-vocabulary ids of the source tokens that are single punctuation marks.
        src_copy_invalid_ids = set(src_copy_vocab.index_sequence(
            [t for t in src_tokens if is_english_punct(t)]))

        return {
            "tgt_tokens": tgt_tokens,
            "tgt_pos_tags": tgt_pos_tags,
            "tgt_copy_indices": tgt_copy_indices,
            "tgt_copy_map": tgt_copy_map,
            "tgt_copy_mask": tgt_copy_mask,
            "src_tokens": src_tokens,
            "src_token_ids": src_token_ids,
            "src_token_subword_index": src_token_subword_index,
            "src_must_copy_tags": src_must_copy_tags,
            "src_pos_tags": src_pos_tags,
            "src_copy_vocab": src_copy_vocab,
            "src_copy_indices": src_copy_indices,
            "src_copy_map": src_copy_map,
            "pos_tag_lut": pos_tag_lut,
            "head_tags": head_tags,
            "head_indices": head_indices,
            "src_copy_invalid_ids": src_copy_invalid_ids
        }

    @classmethod
    def decode(cls, raw_graph_string):
        """Decode a raw graph string with ``amr_codec`` and wrap the result.

        Args:
            raw_graph_string: The text handed to ``amr_codec.decode``.

        Returns:
            An instance of ``cls`` constructed from the decoded graph.
        """
        return cls(amr_codec.decode(raw_graph_string))

    @classmethod
    def from_lists(cls, all_list):
        """Derive variable names and triples from parallel lists describing a graph.

        NOTE(review): this method looks unfinished. ``Triples`` is filled but never
        returned or stored, so a call always evaluates to ``None``; ``cls`` is unused.

        Args:
            all_list: A mapping read at the keys ``'head_tags'``, ``'head_indices'``,
                ``'tokens'`` and ``'coref'``.

        Returns:
            ``None`` (there is no ``return`` statement).
        """
        head_tags = all_list['head_tags']
        head_indices = all_list['head_indices']
        tgt_tokens = all_list['tokens']

        tgt_copy_indices = all_list['coref']
        variables = []
        # Counts, per first character, how many self-referencing tokens were seen so far.
        variables_count = defaultdict(int)
        for i, token in enumerate(tgt_tokens):
            if tgt_copy_indices[i] != i:
                # The token refers to another position: reuse the variable assigned there.
                # This only works when that position was already processed (i.e. is smaller than i).
                variables.append(variables[tgt_copy_indices[i]])
            else:
                # Name the variable after the token's first character, appending the running
                # count when that character was already used (e.g. ``b``, ``b1``, ``b2``).
                if token[0] in variables_count:
                    variables.append(token[0] + str(variables_count[token[0]]))
                else:
                    variables.append(token[0])

                variables_count[token[0]] += 1

        Triples = []
        for variable, token in zip(variables, tgt_tokens):
            Triples.append(Triple(variable, "instance", token))
            # NOTE(review): ``head_indices`` and ``head_tags`` are subscripted with the variable
            # name (a string) rather than a position -- confirm the intended key type.
            Triples.append(
                Triple(
                    head_indices[variable],
                    head_tags[variable],
                    variable
                )
            )

    @classmethod
    def from_prediction(cls, prediction):
        """Assemble a graph from the flat lists of a parser prediction.

        Args:
            prediction: A mapping read at the keys ``'nodes'``, ``'heads'``, ``'corefs'`` and
                ``'head_labels'``. Head and coref values are used as 1-based positions into
                ``nodes``; a head of ``0`` marks the top node. ``prediction['heads']`` is
                modified in place by ``correct_multiroot``.

        Returns:
            An instance of ``cls`` wrapping the assembled graph.
        """

        def is_attribute_value(value):
            # True for a double-quoted string or a string without any ASCII letter.
            return re.search(r'(^".*"$|^[^a-zA-Z]+$)', value) is not None

        def is_attribute_edge(label):
            # Labels whose targets are not turned into variables by the first loop below.
            return label in ('instance', 'mode', 'li', 'value', 'month', 'year', 'day', 'decade', 'ARG6')

        def normalize_number(text):
            # Drop the comma from strings of the form ``<digits>,<digits>`` (e.g. ``1,000`` -> ``1000``).
            if re.search(r'^\d+,\d+$', text):
                text = text.replace(',', '')
            return text

        def abstract_node(value):
            # NOTE(review): this helper is not referenced anywhere else in this method.
            return re.search(r'^([A-Z]+|DATE_ATTRS|SCORE_ENTITY|ORDINAL_ENTITY)_\d+$', value)

        def abstract_attribute(value):
            return re.search(r'^_QUANTITY_\d+$', value)

        def correct_multiroot(heads):
            # Every position after the first whose head is 0 gets re-attached to position 1,
            # so that at most the first node keeps head 0. Mutates and returns the same list.
            for i in range(1, len(heads)):
                if heads[i] == 0:
                    heads[i] = 1
            return heads

        nodes = [normalize_number(n) for n in prediction['nodes']]
        heads = correct_multiroot(prediction['heads'])
        corefs = [int(x) for x in prediction['corefs']]
        head_labels = prediction['head_labels']

        triples = []
        top = None
        # Build the variable map from variable to instance.
        variable_map = {}
        for coref_index in corefs:
            node = nodes[coref_index - 1]
            head_label = head_labels[coref_index - 1]
            # Nodes that look like attribute values (special characters, quoted/non-alphabetic
            # text, attribute edge labels, ``_QUANTITY_n``) do not get a variable here.
            if (re.search(r'[/:\\()]', node) or is_attribute_value(node) or
                    is_attribute_edge(head_label) or abstract_attribute(node)):
                continue
            variable_map['vv{}'.format(coref_index)] = node
        # Any node that serves as a head needs a variable, even if it was skipped above.
        for head_index in heads:
            if head_index == 0:
                continue
            node = nodes[head_index - 1]
            coref_index = corefs[head_index - 1]
            variable_map['vv{}'.format(coref_index)] = node
        # Build edge triples and other attribute triples.
        for i, head_index in enumerate(heads):
            if head_index == 0:
                # Head 0 designates the top; make sure it has a variable.
                top_variable = 'vv{}'.format(corefs[i])
                if top_variable not in variable_map:
                    variable_map[top_variable] = nodes[i]
                top = top_variable
                continue
            head_variable = 'vv{}'.format(corefs[head_index - 1])
            modifier = nodes[i]
            modifier_variable = 'vv{}'.format(corefs[i])
            label = head_labels[i]
            assert head_variable in variable_map
            if modifier_variable in variable_map:
                # Edge between two variables.
                triples.append((head_variable, label, modifier_variable))
            else:
                # Attribute value: quote it when it contains one of / : \ ( ) and is not quoted yet.
                if re.search(r'[/:\\()]', modifier) and not re.search(r'^".*"$', modifier):
                    modifier = '"{}"'.format(modifier)
                triples.append((head_variable, label, modifier))

        # Emit one ``instance`` triple per variable, sanitising the concept name first.
        for var, node in variable_map.items():
            if re.search(r'^".*"$', node):
                node = node[1:-1]
            if re.search(r'[/:\\()]', node):
                # Keep the last non-empty piece between special characters; if every piece is
                # empty (the for-else branch), replace the special characters with underscores.
                parts = re.split(r'[/:\\()]', node)
                for part in parts[::-1]:
                    if len(part):
                        node = part
                        break
                else:
                    node = re.sub(r'[/:\\()]', '_', node)
            triples.append((var, 'instance', node))

        if len(triples) == 0:
            # Nothing was produced: fall back to a single placeholder node.
            triples.append(('vv1', 'instance', 'string-entity'))
            top = 'vv1'
        # Order triples by the numeric suffix of their source variable (``vvN``).
        triples.sort(key=lambda x: int(x[0].replace('vv', '')))
        # The penman graph is populated through its private ``_top`` / ``_triples`` attributes.
        graph = penman.Graph()
        graph._top = top
        graph._triples = [penman.Triple(*t) for t in triples]
        graph = cls(graph)
        try:
            GraphRepair.do(graph, nodes)
            amr_codec.encode(graph)
        except Exception as e:
            # Repair or encoding failed: restore the unrepaired triples (``e`` itself is unused).
            # NOTE(review): ``graph`` is already a ``cls`` instance at this point, so it is the
            # wrapper (not a fresh ``penman.Graph``) that gets reset and wrapped again -- confirm.
            graph._top = top
            graph._triples = [penman.Triple(*t) for t in triples]
            graph = cls(graph)
        return graph


class SourceCopyVocabulary:
    """A per-sentence vocabulary that maps source tokens to copy indices.

    Index ``0`` is reserved for the padding token and index ``1`` for the unknown token;
    the distinct tokens of the sentence follow in order of first occurrence.
    """

    def __init__(self, sentence, pad_token=DEFAULT_PADDING_TOKEN, unk_token=DEFAULT_OOV_TOKEN):
        """
        Args:
            sentence: A list of tokens, or a string that is split on single spaces.
            pad_token: Token stored at index ``0``.
            unk_token: Token stored at index ``1`` and used for out-of-vocabulary lookups.
        """
        # ``isinstance`` instead of an exact type comparison, so that list subclasses are
        # accepted as token lists rather than failing on a missing ``split`` method.
        if not isinstance(sentence, list):
            sentence = sentence.split(" ")

        self.src_tokens = sentence
        self.pad_token = pad_token
        self.unk_token = unk_token

        self.token_to_idx = {self.pad_token: 0, self.unk_token: 1}
        self.idx_to_token = {0: self.pad_token, 1: self.unk_token}

        self.vocab_size = 2

        for token in sentence:
            if token not in self.token_to_idx:
                self.token_to_idx[token] = self.vocab_size
                self.idx_to_token[self.vocab_size] = token
                self.vocab_size += 1

    def get_token_from_idx(self, idx):
        """Return the token stored at ``idx``; raises ``KeyError`` for an unknown index."""
        return self.idx_to_token[idx]

    def get_token_idx(self, token):
        """Return the index of ``token``, or the unknown token's index if it is not in the sentence."""
        return self.token_to_idx.get(token, self.token_to_idx[self.unk_token])

    def index_sequence(self, list_tokens):
        """Map every token of ``list_tokens`` to its index (unknown tokens map to the unk index)."""
        return [self.get_token_idx(token) for token in list_tokens]

    def get_copy_map(self, list_tokens):
        """Pair each position with the vocabulary index of the token at that position.

        The unknown token's index is prepended, so position ``0`` always maps to it and
        the tokens of ``list_tokens`` start at position ``1``.

        Returns:
            A list of ``(position, vocabulary_index)`` tuples.
        """
        src_indices = [self.get_token_idx(self.unk_token)] + self.index_sequence(list_tokens)
        return list(enumerate(src_indices))

    def get_special_tok_list(self):
        """Return the padding and unknown tokens, in that order."""
        return [self.pad_token, self.unk_token]

    def __repr__(self):
        # JSON object keys are strings, so the integer indices are rendered as strings.
        return json.dumps(self.idx_to_token)


def is_similar(instances1, instances2):
    """Decide whether two collections of instances largely contain the same items.

    Args:
        instances1: A sized collection of items.
        instances2: Another sized collection of items.

    Returns:
        ``True`` when strictly more than 80% of the items of each collection occur in the
        other one. Two empty collections are considered similar; an empty collection is
        never similar to a non-empty one.
    """
    if len(instances1) == 0 or len(instances2) == 0:
        # Coverage is undefined for an empty side; answer directly instead of dividing by zero.
        return len(instances1) == len(instances2)

    def coverage(items, pool):
        # Fraction of ``items`` (duplicates counted) that are present in ``pool``.
        return sum(1 for x in items if x in pool) / len(items)

    # Both directions must pass, so which collection is the smaller one is irrelevant.
    return coverage(instances1, instances2) > .8 and coverage(instances2, instances1) > .8


class GraphRepair:
    """Post-processes a graph in place by dropping duplicated children and unknown nodes.

    ``repaired_items`` collects string tags describing which kinds of repair were applied.
    """

    def __init__(self, graph, nodes):
        # ``nodes`` is stored but not read by any method of this class.
        self.graph = graph
        self.nodes = nodes
        self.repaired_items = set()

    @staticmethod
    def do(graph, nodes):
        """Run both repair passes on ``graph``: redundant edges first, then unknown nodes.

        The graph is modified in place and nothing is returned.
        """
        gr = GraphRepair(graph, nodes)
        gr.remove_redundant_edges()
        gr.remove_unknown_nodes()

    def remove_unknown_nodes(self):
        """Strip ``'@@UNKNOWN@@'`` attribute values and remove childless ``'@@UNKNOWN@@'`` nodes."""
        graph = self.graph
        # Snapshot the nodes because the graph is modified while looping.
        nodes = [node for node in graph.get_nodes()]
        for node in nodes:
            # NOTE(review): ``node.attributes`` is iterated while ``remove_node_attribute`` is
            # called on the same node -- confirm that this does not skip entries.
            for attr, value in node.attributes:
                if value == '@@UNKNOWN@@' and attr != 'instance':
                    graph.remove_node_attribute(node, attr, value)
            if node.instance == '@@UNKNOWN@@':
                # Only nodes for which ``_G.edges(node)`` is empty are removed (presumably nodes
                # without outgoing edges, assuming ``_G`` is a networkx-style directed graph).
                if len(list(graph._G.edges(node))) == 0:
                    # Detach the node from everything pointing at it before deleting it.
                    for source, target in list(graph._G.in_edges(node)):
                        graph.remove_edge(source, target)
                    graph.remove_node(node)
                    self.repaired_items.add('remove-unknown-node')

    def remove_redundant_edges(self):
        """
        Remove duplicated children among the edges reported by ``_G.edges(node)`` for each node.

        Children are bucketed as follows: all ``name`` edges share one bucket; ``op*`` and
        ``snt*`` edges are bucketed by the target's instance alone; every other edge is
        bucketed by its label together with the target's instance. Within a bucket holding more
        than one child, extra ``name`` targets are dropped if they are otherwise unconnected,
        while other children are grouped by subtree similarity and only the child with the
        largest subtree of each group is kept.
        """
        graph = self.graph
        nodes = [node for node in graph.get_nodes()]
        # Nodes deleted by an earlier iteration; they are skipped when reached later.
        removed_nodes = set()
        for node in nodes:
            if node in removed_nodes:
                continue
            edges = list(graph._G.edges(node))
            edge_counter = defaultdict(list)
            for source, target in edges:
                label = graph._G[source][target]['label']
                # `name`, `ARGx`, and `ARGx-of` should only appear once.
                # (The ``ARG`` part of that rule is currently disabled, see the trailing comment.)
                if label == 'name':  # or label.startswith('ARG'):
                    edge_counter[label].append(target)
                # the target of `opx' should only appear once.
                elif label.startswith('op') or label.startswith('snt'):
                    edge_counter[str(target.instance)].append(target)
                else:
                    edge_counter[label + str(target.instance)].append(target)
            for label, children in edge_counter.items():
                if len(children) == 1:
                    continue
                if label == 'name':
                    # remove redundant edges.
                    # Keep the first ``name`` child; drop a later one only when this edge is its
                    # single incoming edge and ``_G.edges(target)`` is empty.
                    for target in children[1:]:
                        if len(list(graph._G.in_edges(target))) == 1 and len(list(graph._G.edges(target))) == 0:
                            graph.remove_edge(node, target)
                            graph.remove_node(target)
                            removed_nodes.add(target)
                            self.repaired_items.add('remove-redundant-edge')
                    continue
                # Group children with the same instance whose subtrees (as returned by
                # ``get_subtree(child, 5)``) are similar according to ``is_similar``.
                visited_children = set()
                groups = []
                for i, target in enumerate(children):
                    if target in visited_children:
                        continue
                    subtree_instances1 = [n.instance for n in graph.get_subtree(target, 5)]
                    group = [(target, subtree_instances1)]
                    visited_children.add(target)
                    for _t in children[i + 1:]:
                        if _t in visited_children or target.instance != _t.instance:
                            continue
                        subtree_instances2 = [n.instance for n in graph.get_subtree(_t, 5)]
                        if is_similar(subtree_instances1, subtree_instances2):
                            group.append((_t, subtree_instances2))
                            visited_children.add(_t)
                    groups.append(group)
                for group in groups:
                    if len(group) == 1:
                        continue
                    # Keep the member with the most subtree instances (the first one on ties).
                    kept_target, _ = max(group, key=lambda x: len(x[1]))
                    for target, _ in group:
                        if target == kept_target:
                            continue
                        # NOTE(review): unlike the ``name`` branch, this path does not add a tag
                        # to ``repaired_items``.
                        graph.remove_edge(node, target)
                        removed_nodes.update(graph.remove_subtree(target))


================================================
FILE: plugins/hanlp_common/hanlp_common/configurable.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-16 22:24
from hanlp_common.reflection import str_to_type, classpath_of


class Configurable(object):
    @staticmethod
    def from_config(config: dict, **kwargs):
        """Instantiate the object described by a config dict.

        Args:
          config: A ``dict`` of constructor parameters. It must carry a ``classpath`` key whose
                    value is the classpath str of the type to build. Values which are themselves
                    dicts with a ``classpath`` key are deserialized recursively.
          kwargs: Arguments not used.

        Returns: A deserialized object.

        """
        classpath = config.get('classpath', None)
        assert classpath, f'{config} doesn\'t contain classpath field'
        target_type = str_to_type(classpath)
        params = dict(config)
        # Nested configs are turned into objects before the outer object is built.
        params.update(
            (key, Configurable.from_config(value))
            for key, value in config.items()
            if isinstance(value, dict) and 'classpath' in value
        )
        if target_type.from_config != Configurable.from_config:
            # The target type customises deserialization: delegate to it, classpath included.
            return target_type.from_config(params)
        del params['classpath']
        return target_type(**params)


class AutoConfigurable(Configurable):
    @property
    def config(self) -> dict:
        """
        The config of this object: its ``classpath`` followed by every instance attribute whose
        name does not start with ``_``. An attribute value that has a ``config`` attribute of its
        own is replaced by that ``config``. To keep an attribute out of the config, declare it
        with the prefix ``_``.
        """
        exported = {'classpath': classpath_of(self)}
        for name, value in self.__dict__.items():
            if name.startswith('_'):
                continue
            exported[name] = value.config if hasattr(value, 'config') else value
        return exported

    def __repr__(self) -> str:
        return repr(self.config)


================================================
FILE: plugins/hanlp_common/hanlp_common/conll.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-19 20:50
from typing import Union, List

from hanlp_common.structure import SerializableDict
from hanlp_common.visualization import pretty_tree_horizontal, make_table, markdown_table


class CoNLLWord(SerializableDict):
    def __init__(self, id, form, lemma=None, cpos=None, pos=None, feats=None, head=None, deprel=None, phead=None,
                 pdeprel=None):
        """One token in the CoNLL-X format (:cite:`buchholz-marsi-2006-conll`),
        see http://anthology.aclweb.org/W/W06/W06-2920.pdf

        Args:
            id (int):
                Token counter, starting at 1 for each new sentence.
            form (str):
                Word form or punctuation symbol.
            lemma (str):
                Lemma or stem (depending on the particular treebank) of word form, or an underscore if not available.
            cpos (str):
                Coarse-grained part-of-speech tag, where the tagset depends on the treebank.
            pos (str):
                Fine-grained part-of-speech tag, where the tagset depends on the treebank.
            feats (str):
                Unordered set of syntactic and/or morphological features (depending on the particular treebank),
                or an underscore if not available.
            head (Union[int, List[int]]):
                Head of the current token, which is either a value of ID,
                or zero (’0’) if the token links to the virtual root node of the sentence.
            deprel (Union[str, List[str]]):
                Dependency relation to the HEAD.
            phead (int):
                Projective head of current token, which is either a value of ID or zero (’0’),
                or an underscore if not available.
            pdeprel (str):
                Dependency relation to the PHEAD, or an underscore if not available.
        """
        # The assignment order below is kept as is, since it fixes the order of ``__dict__``.
        self.id = sanitize_conll_int_value(id)
        self.form = form
        self.cpos = cpos
        self.pos = pos
        self.head = sanitize_conll_int_value(head)
        self.deprel = deprel
        self.lemma = lemma
        self.feats = feats
        self.phead = phead
        self.pdeprel = pdeprel

    def __str__(self):
        # A token with several heads is written as one line per (head, deprel) pair.
        if isinstance(self.head, list):
            arcs = zip(self.head, self.deprel)
        else:
            arcs = [(self.head, self.deprel)]
        lines = []
        for head, deprel in arcs:
            columns = [str(self.id), self.form, self.lemma, self.cpos, self.pos, self.feats,
                       None if head is None else str(head), deprel, self.phead, self.pdeprel]
            lines.append('\t'.join('_' if column is None else column for column in columns))
        return '\n'.join(lines)

    @property
    def nonempty_fields(self):
        """
        The truthy values among all fields except ``id``, as a list in column order.
        """
        candidates = (self.form, self.lemma, self.cpos, self.pos, self.feats, self.head, self.deprel, self.phead,
                      self.pdeprel)
        return [field for field in candidates if field]

    def get_pos(self):
        """
        Get the most fine-grained pos available for this word.

        Returns: ``self.pos`` if it is truthy, otherwise ``self.cpos``.

        """
        return self.pos or self.cpos


class CoNLLUWord(SerializableDict):
    def __init__(self, id: Union[int, str], form, lemma=None, upos=None, xpos=None, feats=None, head=None, deprel=None,
                 deps=None,
                 misc=None):
        """CoNLL-U format template, see https://universaldependencies.org/format.html

        Args:

            id (Union[int, str]):
                Token counter, starting at 1 for each new sentence.
            form (Union[str, None]):
                Word form or punctuation symbol.
            lemma (str):
                Lemma or stem (depending on the particular treebank) of word form, or an underscore if not available.
            upos (str):
                Universal part-of-speech tag.
            xpos (str):
                Language-specific part-of-speech tag; underscore if not available.
            feats (str):
                List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available.
            head (int):
                Head of the current token, which is either a value of ID,
                or zero (’0’) if the token links to the virtual root node of the sentence.
                When a list of heads is passed together with a list of ``deprel``, the pairs are
                stored in ``deps`` instead and ``head``/``deprel`` are set to ``None``.
            deprel (str):
                Dependency relation to the HEAD.
            deps (Union[List[Tuple[int, str]], str]):
                Enhanced dependency graph as a list of ``(head, deprel)`` pairs, or its string form
                ``head:deprel|head:deprel`` which is parsed into such a list; an underscore if not
                available.
            misc (str):
                Any other annotation.
        """
        # The assignment order below is kept as is, since it fixes the order of ``__dict__``.
        self.id = sanitize_conll_int_value(id)
        self.form = form
        self.upos = upos
        self.xpos = xpos
        if isinstance(head, list):
            assert deps is None, 'When head is a list, deps has to be None'
            assert isinstance(deprel, list), 'When head is a list, deprel has to be a list'
            assert len(deprel) == len(head), 'When head is a list, deprel has to match its length'
            deps = list(zip(head, deprel))
            head = None
            deprel = None
        self.head = sanitize_conll_int_value(head)
        self.deprel = deprel
        self.lemma = lemma
        self.feats = feats
        if deps == '_':
            deps = None
        if isinstance(deps, str):
            self.deps = []
            for pair in deps.split('|'):
                # Relations may themselves contain colons (e.g. ``2:nsubj:pass``), so only the
                # first colon separates the head from the relation.
                h, r = pair.split(':', 1)
                h = int(h)
                self.deps.append((h, r))
        else:
            self.deps = deps
        self.misc = misc

    def __str__(self):
        deps = self.deps
        if not deps:
            deps = None
        else:
            deps = '|'.join(f'{h}:{r}' for h, r in deps)
        values = [str(self.id), self.form, self.lemma, self.upos, self.xpos, self.feats,
                  str(self.head) if self.head is not None else None, self.deprel, deps, self.misc]
        # Missing columns are written as underscores.
        return '\t'.join(['_' if v is None else v for v in values])

    @property
    def nonempty_fields(self):
        """
        Get the truthy values among all fields except ``id``, as a list in column order.
        """
        return list(f for f in
                    [self.form, self.lemma, self.upos, self.xpos, self.feats, self.head, self.deprel, self.deps,
                     self.misc] if f)

    def get_pos(self):
        """
        Get the precisest pos for this word.

        Returns: ``self.xpos`` or ``self.upos``

        """
        return self.xpos or self.upos


class CoNLLSentence(list):
    def __init__(self, words=None):
        """
        A list of :class:`~hanlp_common.conll.CoNLLWord` or :class:`~hanlp_common.conll.CoNLLUWord`. It is a sub-class
        of :class:`list` and its words can be accessed in the same way as accessing list elements.

        Args:
            words (list[Union[CoNLLWord, CoNLLUWord]]): A list of words.
        """
        super().__init__()
        if words:
            self.extend(words)

    def __str__(self):
        return '\n'.join([str(word) for word in self])

    @staticmethod
    def from_str(conll: str, conllu=False):
        """Build a CoNLLSentence from a CoNLL-X or CoNLL-U format str.

        Comment lines (starting with ``#``), multiword token ranges (ids such as ``1-2``) and
        empty nodes (ids such as ``8.1``) are skipped. Consecutive lines sharing the same id are
        merged into one word holding a list of heads and a list of relations.

        Args:
          conll (str): CoNLL-X or CoNLL-U format string
          conllu:  ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token.

        Returns:
            A :class:`~hanlp_common.conll.CoNLLSentence`.
        """
        words: List[Union[CoNLLWord, CoNLLUWord]] = []
        prev_id = None
        for line in conll.strip().split('\n'):
            if line.startswith('#'):
                continue
            cells = line.split('\t')
            cells = [None if c == '_' else c for c in cells]
            # Neither a multiword token range nor an empty node has an integer id; skip both.
            if '-' in cells[0] or '.' in cells[0]:
                continue
            cells[0] = int(cells[0])
            # An underscore head (already mapped to None) means the head is not annotated.
            if cells[6] is not None:
                cells[6] = int(cells[6])
            if cells[0] != prev_id:
                words.append(CoNLLUWord(*cells) if conllu else CoNLLWord(*cells))
            else:
                # Same id as the previous line: accumulate another (head, deprel) pair.
                if isinstance(words[-1].head, list):
                    words[-1].head.append(cells[6])
                    words[-1].deprel.append(cells[7])
                else:
                    words[-1].head = [words[-1].head] + [cells[6]]
                    words[-1].deprel = [words[-1].deprel] + [cells[7]]
            prev_id = cells[0]
        if conllu:
            # CoNLL-U words keep multiple heads in ``deps`` rather than in ``head``/``deprel``.
            for word in words:  # type: CoNLLUWord
                if isinstance(word.head, list):
                    assert not word.deps
                    word.deps = list(zip(word.head, word.deprel))
                    word.head = None
                    word.deprel = None
        return CoNLLSentence(words)

    @staticmethod
    def from_file(path: str, conllu=False):
        """Build a list of CoNLLSentence from a ``.conllx`` or ``.conllu`` file.

        Sentences are separated by blank lines. The file is read as UTF-8.

        Args:
          path: Path to the file.
          conllu:  ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token.

        Returns:
            A list of :class:`~hanlp_common.conll.CoNLLSentence`, one per sentence in the file.
        """
        # Explicit encoding so that the result does not depend on the platform's default locale.
        with open(path, encoding='utf-8') as src:
            return [CoNLLSentence.from_str(x, conllu) for x in src.read().split('\n\n') if x.strip()]

    @staticmethod
    def from_dict(d: dict, conllu=False):
        """Build a CoNLLSentence from a dict.

        Args:
            d: A dict storing a list for each field, where each index corresponds to a token.
            conllu: ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token.

        Returns:
            A :class:`~hanlp_common.conll.CoNLLSentence`.
        """
        if conllu:
            headings = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
        else:
            headings = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
        words: List[Union[CoNLLWord, CoNLLUWord]] = []
        # Transpose the column-wise dict into one tuple of cells per token.
        for cells in zip(*list(d[f] for f in headings)):
            words.append(CoNLLUWord(*cells) if conllu else CoNLLWord(*cells))
        return CoNLLSentence(words)

    def to_markdown(self, headings: Union[str, List[str]] = 'auto') -> str:
        r"""Convert into markdown string.

        Args:
            headings: ``auto`` to automatically detect the word type. When passed a list of string, they are treated as
                        headings for each field.

        Returns:
            A markdown representation of this sentence.
        """
        cells = [str(word).split('\t') for word in self]
        if headings == 'auto':
            if isinstance(self[0], CoNLLWord):
                headings = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
            else:  # conllu
                headings = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
                for each in cells:
                    # The pipe separating DEPS pairs would be read as a markdown column
                    # delimiter, so it is replaced by a look-alike character.
                    each[8] = each[8].replace('|', '⎮')
        alignment = [('^', '>'), ('^', '<'), ('^', '<'), ('^', '<'), ('^', '<'), ('^', '<'), ('^', '>'), ('^', '<'),
                     ('^', '<'), ('^', '<')]
        text = markdown_table(headings, cells, alignment=alignment)
        return text

    def to_tree(self, extras: List[str] = None) -> str:
        """Convert into a pretty tree string which can be printed to show the tree structure.

        Args:
            extras: Extra table to be aligned to this tree. Its first row is appended to the
                header and each following row to the row of the corresponding word.

        Returns:
            A pretty tree string along with extra table if passed any.
        """
        arrows = []
        for word in self:  # type: Union[CoNLLWord, CoNLLUWord]
            # Words without a head, or attached to the virtual root (0), get no arrow.
            if word.head:
                arrows.append({'from': word.head - 1, 'to': word.id - 1})
        tree = pretty_tree_horizontal(arrows)
        rows = [['Dep Tree', 'Token', 'Relation']]
        # Optional columns are shown only when every word provides a value.
        has_lem = all(x.lemma for x in self)
        has_pos = all(x.get_pos() for x in self)
        if has_lem:
            rows[0].append('Lemma')
        if has_pos:
            rows[0].append('PoS')
        if extras:
            rows[0].extend(extras[0])
        for i, (word, arc) in enumerate(zip(self, tree)):
            cell_per_word = [arc]
            cell_per_word.append(word.form)
            cell_per_word.append(word.deprel)
            if has_lem:
                cell_per_word.append(word.lemma)
            if has_pos:
                cell_per_word.append(word.get_pos())
            if extras:
                cell_per_word.extend(extras[i + 1])
            rows.append(cell_per_word)
        return make_table(rows, insert_header=True)

    @property
    def projective(self):
        """
        ``True`` if this tree is projective.
        """
        return isprojective([x.head for x in self])


class CoNLLSentenceList(list):
    """A list whose ``str`` form joins the ``str`` of its items with one blank line in between."""

    def __str__(self) -> str:
        rendered = map(str, self)
        return '\n\n'.join(rendered)


def sanitize_conll_int_value(value: Union[str, int]):
    """Normalize an integer CoNLL column.

    Args:
        value: The raw column value.

    Returns:
        ``None`` for the placeholder ``'_'``, ``int(value)`` for any other str, and the value
        itself for ``None``, ints and every other type.
    """
    already_clean = value is None or isinstance(value, int)
    if not already_clean:
        if value == '_':
            return None
        if isinstance(value, str):
            return int(value)
    return value


def isprojective(sequence):
    r"""
    Checks if a dependency tree is projective.
    This also works for partial annotation.

    Besides the obvious crossing arcs, the examples below illustrate two non-projective cases
    which are hard to detect in the scenario of partial annotation.

    Args:
        sequence (list[int]):
            A list of head indices, where the i-th entry (counting from 1) is the head of the
            i-th token. Negative values and ``None`` denote un-annotated tokens and are ignored.

    Returns:
        ``True`` if the tree is projective, ``False`` otherwise.

    Examples:
        >>> isprojective([2, -1, 1])  # -1 denotes un-annotated cases
        False
        >>> isprojective([3, -1, 2])
        False
        >>> isprojective([2, None, 1])  # None is treated like -1
        False
    """

    # Keep only annotated arcs as (head, dependent) pairs; dependents are numbered from 1.
    # ``None`` heads (a missing HEAD column) are skipped instead of failing the comparison.
    pairs = [(h, d) for d, h in enumerate(sequence, 1) if h is not None and h >= 0]
    for i, (hi, di) in enumerate(pairs):
        # Left and right end of the first arc; it does not change within the inner loop.
        li, ri = sorted([hi, di])
        for hj, dj in pairs[i + 1:]:
            lj, rj = sorted([hj, dj])
            # The head of the second arc lies within the first arc while the second arc's
            # dependent is the first arc's head.
            if li <= hj <= ri and hi == dj:
                return False
            # The mirrored case with the roles of the two arcs swapped.
            if lj <= hi <= rj and hj == di:
                return False
            # One end of the second arc lies strictly inside the first arc while the second
            # arc is shifted as a whole to one side: the two arcs cross.
            if (li < lj < ri or li < rj < ri) and (li - lj) * (ri - rj) > 0:
                return False
    return True


================================================
FILE: plugins/hanlp_common/hanlp_common/constant.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-06-13 22:41
import os

# Special token strings shared across the package.
PAD = '<pad>'
'''Padding token.'''
UNK = '<unk>'
'''Unknown token.'''
CLS = '[CLS]'
BOS = '<bos>'
EOS = '<eos>'
# ROOT is an alias of BOS: both names are bound to the same string.
ROOT = BOS
IDX = '_idx_'
'''Key for index.'''
# Taken from the HANLP_URL environment variable at import time, with the URL below as fallback.
HANLP_URL = os.getenv('HANLP_URL', 'https://file.hankcs.com/hanlp/')
'''Resource URL.'''
# On unless the HANLP_VERBOSE environment variable is set to something other than
# '1', 'true' or 'yes' (compared case-insensitively).
HANLP_VERBOSE = os.environ.get('HANLP_VERBOSE', '1').lower() in ('1', 'true', 'yes')
'''Enable verbose or not.'''
NULL = '<null>'
PRED = 'PRED'

IPYTHON = os.environ.get('HANLP_IPYTHON', '1').lower() in ('1', 'true', 'yes')  # Allow the user to disable IPYTHON
if IPYTHON:
    # Even when allowed, IPYTHON stays on only if the name ``get_ipython`` is defined;
    # merely referencing it raises NameError otherwise (presumably it is injected by an
    # IPython/Jupyter session -- confirm).
    try:
        # noinspection PyUnresolvedReferences,PyStatementEffect
        get_ipython
    except NameError:
        IPYTHON = False


================================================
FILE: plugins/hanlp_common/hanlp_common/document.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 04:16
import json
import re
import warnings
from typing import List, Union

from phrasetree.tree import Tree

from hanlp_common.conll import CoNLLUWord, CoNLLSentence, CoNLLSentenceList
from hanlp_common.constant import PRED, IPYTHON
from hanlp_common.util import collapse_json, prefix_match
from hanlp_common.visualization import tree_to_list, list_to_tree, render_labeled_span, make_table


class Document(dict):
    def __init__(self, *args, **kwargs) -> None:
        r"""A dict structure holding parsed annotations. A document is a subclass of ``dict`` and it supports every
        interface of ``dict``\. Additionally, it supports interfaces to deal with various linguistic structures. Its
        ``str`` and ``dict`` representations are made to be compatible with JSON serialization.

        On construction, a non-empty ``con`` value given in list form is converted with ``list_to_tree``, and a
        non-empty ``amr`` value given as triples is wrapped into ``AMRGraph`` objects. Values that already are
        trees/graphs are kept as they are.

        Args:
            *args: An iterator of key-value pairs.
            **kwargs: Arguments from ``**`` operator.

        Examples::

            # Create a document
            doc = Document(
                tok=[["晓美焰", "来到", "北京", "立方庭", "参观", "自然", "语义", "科技", "公司"]],
                pos=[["NR", "VV", "NR", "NR", "VV", "NN", "NN", "NN", "NN"]],
                ner=[[["晓美焰", "PERSON", 0, 1], ["北京立方庭", "LOCATION", 2, 4],
                      ["自然语义科技公司", "ORGANIZATION", 5, 9]]],
                dep=[[[2, "nsubj"], [0, "root"], [4, "name"], [2, "dobj"], [2, "conj"],
                      [9, "compound"], [9, "compound"], [9, "compound"], [5, "dobj"]]]
            )

            # print(doc) or str(doc) to get its JSON representation
            print(doc)

            # Access an annotation by its task name
            print(doc['tok'])

            # Get number of sentences
            print(f'It has {doc.count_sentences()} sentence(s)')

            # Access the n-th sentence
            print(doc.squeeze(0)['tok'])

            # Pretty print it right in your console or notebook
            doc.pretty_print()

            # To save the pretty prints in a str
            pretty_text: str = '\n\n'.join(doc.to_pretty())

        """
        super().__init__(*args, **kwargs)
        # Iterate over a snapshot because values are re-assigned inside the loop.
        for k, v in list(self.items()):
            if not v:
                continue
            if k == 'con':
                if isinstance(v, Tree) or isinstance(v[0], Tree):
                    continue
                # A single tree in list form starts with its label, which is a str.
                flat = isinstance(v[0], str)
                if flat:
                    v = [v]
                # Elements that already are trees are kept (instead of being dropped) so that the i-th tree
                # still belongs to the i-th sentence.
                ls = [each if isinstance(each, Tree) else list_to_tree(each) for each in v]
                if flat:
                    ls = ls[0]
                self[k] = ls
            elif k == 'amr':
                # Imported lazily so that the optional AMR dependencies are only needed when used.
                from hanlp_common.amr import AMRGraph
                import penman
                if isinstance(v, AMRGraph) or isinstance(v[0], AMRGraph):
                    continue
                # A single graph is a list of triples whose first item is a str.
                flat = isinstance(v[0][0], str)
                if flat:
                    v = [v]
                graphs = [AMRGraph(penman.Graph(triples)) for triples in v]
                if flat:
                    graphs = graphs[0]
                self[k] = graphs

    def to_json(self, ensure_ascii=False, indent=2) -> str:
        """Serialize this document into a JSON string.

        The document is first converted with :meth:`to_dict`, objects the JSON encoder cannot handle are
        replaced by their ``repr``, and the dumped text is finally passed through ``collapse_json`` with an
        indent level of 4.

        Args:
            ensure_ascii: ``False`` to allow for non-ascii text.
            indent: Indent per nested structure.

        Returns:
            A text representation in ``str``.

        """
        dumped = json.dumps(self.to_dict(), ensure_ascii=ensure_ascii, indent=indent, default=repr)
        return collapse_json(dumped, 4)

    def to_dict(self):
        """Convert to a json compatible dict.

        A shallow copy of the document is made and the trees stored under ``con`` are converted to nested
        lists with ``tree_to_list``. All other fields are copied as they are.

        Returns:
            A dict representation.
        """
        d = dict(self)
        for k, v in self.items():
            if v == [] or v is None:
                continue
            if k == 'con':
                if not isinstance(v, Tree) and not isinstance(v[0], Tree):
                    continue
                # A single tree (flat document) is wrapped so both layouts share the conversion below.
                flat = isinstance(v, Tree)
                if flat:
                    v = [v]
                # Elements which are not trees are passed through (instead of being dropped) so that the
                # i-th entry still belongs to the i-th sentence.
                ls = [tree_to_list(each) if isinstance(each, Tree) else each for each in v]
                if flat:
                    ls = ls[0]
                d[k] = ls
        return d

    def __str__(self) -> str:
        """Return the JSON text produced by :meth:`to_json` with its default arguments."""
        return self.to_json()

    def to_conll(self, tok='tok', lem='lem', pos='pos', fea='fea', dep='dep', sdp='sdp') -> Union[
        CoNLLSentence, List[CoNLLSentence]]:
        """
        Convert to :class:`~hanlp_common.conll.CoNLLSentence`.

        Every field name is resolved against the keys of this document with ``prefix_match`` first. Spans
        attached to tokens are stripped before the conversion.

        Args:
            tok (str): Field name for tok.
            lem (str): Field name for lem.
            pos (str): Field name for upos.
            fea (str): Field name for feats.
            dep (str): Field name for dependency parsing.
            sdp (str): Field name for semantic dependency parsing.

        Returns:
            A single :class:`~hanlp_common.conll.CoNLLSentence` when the document holds one flat sentence,
            otherwise a list of sentences. An empty list is returned when there are no tokens.

        """
        tok, lem, pos, fea, dep, sdp = [prefix_match(name, self) for name in (tok, lem, pos, fea, dep, sdp)]
        sentences = CoNLLSentenceList()
        if not tok or not self[tok]:
            return sentences
        doc = self._to_doc_without_spans(tok)
        flat = isinstance(doc[tok][0], str)
        # A flat document is wrapped into a batch of one sentence so that both layouts share the loop below.
        batch = Document((k, [v]) for k, v in doc.items()) if flat else doc

        def pick(sample, key, index):
            # Annotation of the index-th token, or None when the field is absent or empty.
            values = sample.get(key, None)
            return values[index] if values else None

        field_names = list(batch)
        # Transpose field -> sentences into one dict of fields per sentence.
        for columns in list(zip(*batch.values())):
            sample = dict(zip(field_names, columns))
            sent = CoNLLSentence()
            for index, form in enumerate(sample[tok]):
                arc = pick(sample, dep, index) or (None, None)
                lemma = pick(sample, lem, index)
                upos = pick(sample, pos, index)
                feats = pick(sample, fea, index)
                graph_arcs = pick(sample, sdp, index)
                if graph_arcs:
                    deps = '|'.join(f'{x[0]}:{x[1]}' for x in graph_arcs)
                else:
                    deps = None
                sent.append(CoNLLUWord(index + 1, form=form, lemma=lemma, upos=upos, feats=feats,
                                       head=arc[0], deprel=arc[1], deps=deps))
            sentences.append(sent)
        return sentences[0] if flat else sentences

    def to_pretty(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl', con='con',
                  show_header=True, html=False) -> Union[str, List[str]]:
        """
        Convert to a pretty text representation which can be printed to visualize linguistic structures.

        For every sentence, the NER, SRL and constituency annotations are rendered as extra columns. These
        columns are attached to the dependency tree when a projective ``dep`` annotation is present, shown as
        a stand-alone table when there is no such tree, and replaced by a plain ``form/field/...`` line when
        there is nothing to draw.

        Args:
            tok: Token key.
            lem: Lemma key.
            pos: Part-of-speech key.
            dep: Dependency parse tree key.
            sdp: Semantic dependency tree/graph key. SDP visualization has not been implemented yet.
            ner: Named entity key.
            srl: Semantic role labeling key.
            con: Constituency parsing key.
            show_header: ``True`` to include a header which indicates each field with its name.
            html: ``True`` to output HTML format so that non-ASCII characters can align correctly.

        Returns:
            A pretty string for a single flat sentence, otherwise a list of pretty strings, one per sentence.

        """
        results = []
        # Only tok, pos and ner are resolved here; lem, dep and sdp are resolved inside to_conll.
        tok = prefix_match(tok, self)
        pos = prefix_match(pos, self)
        ner = prefix_match(ner, self)
        conlls = self.to_conll(tok=tok, lem=lem, pos=pos, dep=dep, sdp=sdp)
        flat = isinstance(conlls, CoNLLSentence)
        if flat:
            conlls: List[CoNLLSentence] = [conlls]

        def condense(block_, extras_=None):
            # Render a block as a table, then keep the first column and merge all the others into one cell.
            text_ = make_table(block_, insert_header=False)
            text_ = [x.split('\t', 1) for x in text_.split('\n')]
            text_ = [[x[0], x[1].replace('\t', '')] for x in text_]
            if extras_:
                # Append the condensed columns row by row to the accumulated extra columns.
                for r, s in zip(extras_, text_):
                    r.extend(s)
            return text_

        for i, conll in enumerate(conlls):
            conll: CoNLLSentence = conll
            tokens = [x.form for x in conll]
            length = len(conll)
            # One row for the header plus one row per token.
            extras = [[] for j in range(length + 1)]
            if ner in self:
                ner_samples = self[ner]
                if flat:
                    ner_samples = [ner_samples]
                ner_per_sample = ner_samples[i]
                # For nested NER, use the longest span
                start_offsets = [None for i in range(length)]
                for ent, label, b, e in ner_per_sample:
                    if not start_offsets[b] or e > start_offsets[b][-1]:
                        start_offsets[b] = (ent, label, b, e)
                ner_per_sample = [y for y in start_offsets if y]
                header = ['Token', 'NER', 'Type']
                block = [[] for _ in range(length + 1)]
                _ner = []
                _type = []
                offset = 0
                for ent, label, b, e in ner_per_sample:
                    render_labeled_span(b, e, _ner, _type, label, offset)
                    offset = e
                # Pad the columns with blanks for the tokens after the last entity.
                if offset != length:
                    _ner.extend([''] * (length - offset))
                    _type.extend([''] * (length - offset))
                if any(_type):
                    block[0].extend(header)
                    for j, (_s, _t) in enumerate(zip(_ner, _type)):
                        block[j + 1].extend((tokens[j], _s, _t))
                    text = condense(block, extras)

            if srl in self:
                srl_samples = self[srl]
                if flat:
                    srl_samples = [srl_samples]
                srl_per_sample = srl_samples[i]
                # Each predicate-argument structure gets its own pair of columns.
                for k, pas in enumerate(srl_per_sample):
                    if not pas:
                        continue
                    block = [[] for _ in range(length + 1)]
                    header = ['Token', 'SRL', f'PA{k + 1}']
                    _srl = []
                    _type = []
                    offset = 0
                    p_index = None
                    for _, label, b, e in pas:
                        render_labeled_span(b, e, _srl, _type, label, offset)
                        offset = e
                        # Remember where the predicate starts so that it gets a distinct marker.
                        if label == PRED:
                            p_index = b
                    if len(_srl) != length:
                        _srl.extend([''] * (length - offset))
                        _type.extend([''] * (length - offset))
                    if p_index is not None:
                        _srl[p_index] = '╟──►'
                        # _type[j] = 'V'
                        if len(block) != len(_srl) + 1:
                            # warnings.warn(f'Unable to visualize overlapped spans: {pas}')
                            continue
                        block[0].extend(header)
                        while len(_srl) < length:
                            _srl.append('')
                        while len(_type) < length:
                            _type.append('')
                        for j, (_s, _t) in enumerate(zip(_srl, _type)):
                            block[j + 1].extend((tokens[j], _s, _t))
                    # NOTE(review): this call is also reached when no predicate was found, in which case the
                    # block only holds empty rows -- confirm that make_table/condense cope with that.
                    text = condense(block, extras)
            if con in self:
                con_samples: Tree = self[con]
                if flat:
                    con_samples: List[Tree] = [con_samples]
                tree = con_samples[i]
                block = [[] for _ in range(length + 1)]
                block[0].extend(('Token', 'PoS'))
                for j, t in enumerate(tree.pos()):
                    block[j + 1].extend(t)

                # Draw the tree level by level, from the lowest subtrees up to the root.
                for height in range(2, tree.height() + (0 if len(tree) == 1 else 1)):
                    offset = 0
                    spans = []
                    labels = []
                    for k, subtree in enumerate(tree.subtrees(lambda x: x.height() == height)):
                        subtree: Tree = subtree
                        b, e = offset, offset + len(subtree.leaves())
                        # Higher levels span from the center of their first child to the center of their last.
                        if height >= 3:
                            b, e = subtree[0].center, subtree[-1].center + 1
                        subtree.center = b + (e - b) // 2
                        render_labeled_span(b, e, spans, labels, subtree.label(), offset, unidirectional=True)
                        offset = e
                    if len(spans) != length:
                        spans.extend([''] * (length - len(spans)))
                    if len(labels) != length:
                        labels.extend([''] * (length - len(labels)))
                    # Height 2 only serves to compute the centers; the PoS column is already in the block.
                    if height < 3:
                        continue
                    block[0].extend(['', f'{height}'])
                    for j, (_s, _t) in enumerate(zip(spans, labels)):
                        block[j + 1].extend((_s, _t))
                    # check short arrows and increase their length
                    for j, arrow in enumerate(spans):
                        if not arrow:
                            # -1 current tag ; -2 arrow to current tag ; -3 = prev tag ; -4 = arrow to prev tag
                            if block[j + 1][-3] or block[j + 1][-4] == '───►':
                                if height > 3:
                                    if block[j + 1][-3]:
                                        block[j + 1][-1] = block[j + 1][-3]
                                        block[j + 1][-2] = '───►'
                                    else:
                                        block[j + 1][-1] = '────'
                                        block[j + 1][-2] = '────'
                                    block[j + 1][-3] = '────'
                                    if block[j + 1][-4] == '───►':
                                        block[j + 1][-4] = '────'
                                else:
                                    block[j + 1][-1] = '────'
                                if block[j + 1][-1] == '────':
                                    block[j + 1][-2] = '────'
                                if not block[j + 1][-4]:
                                    block[j + 1][-4] = '────'
                # If the root label is shorter than the level number, extend it to the same length
                level_len = len(block[0][-1])
                for row in block[1:]:
                    if row[-1] and len(row[-1]) < level_len:
                        row[-1] = row[-1] + ' ' * (level_len - len(row[-1]))

                text = condense(block)
                # Cosmetic issues
                for row in text[1:]:
                    while '  ─' in row[1]:
                        row[1] = row[1].replace('  ─', ' ──')
                    row[1] = row[1].replace('─ ─', '───')
                    row[1] = re.sub(r'([►─])([\w-]*)(\s+)([│├])', lambda
                        m: f'{m.group(1)}{m.group(2)}{"─" * len(m.group(3))}{"┤" if m.group(4) == "│" else "┼"}',
                                    row[1])
                    row[1] = re.sub(r'►(─+)►', r'─\1►', row[1])
                for r, s in zip(extras, text):
                    r.extend(s)
            # warnings.warn('Unable to visualize non-projective trees.')
            if dep in self and conll.projective:
                text = conll.to_tree(extras)
                if not show_header:
                    # Drop the first two lines of the rendered tree.
                    text = text.split('\n')
                    text = '\n'.join(text[2:])
                results.append(text)
            elif any(extras):
                results.append(make_table(extras, insert_header=True))
            else:
                # Nothing to draw: fall back to the non-empty fields of each word joined by slashes.
                results.append(' '.join(['/'.join(str(f) for f in x.nonempty_fields) for x in conll]))
        if html:
            def to_html(pretty_text: str) -> str:
                # Turn the tab separated table into one <pre> cell per column.
                lines = [x for x in pretty_text.split('\n') if x]
                cells = []
                for line in lines:
                    cells.append(line.split('\t'))

                num_cols = len(cells[0])
                cols = []

                # Transpose the rows into columns.
                for i in range(num_cols):
                    cols.append([])
                    for row in cells:
                        cols[-1].append(row[i])

                html = '<div style="display: table; padding-bottom: 1rem;">'
                for i, each in enumerate(cols):
                    html += '<pre style="display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,' \
                            'Liberation Mono,Courier New,monospace; white-space: nowrap; line-height: 128%; padding: 0;">'
                    # Every column but the last one gets a trailing space as separator.
                    if i != len(cols) - 1:
                        each = [x + ' ' for x in each]
                    html += '<br>'.join([x.replace(' ', '&nbsp;') for x in each])
                    html += '</pre>'
                html += '</div>'
                return html

            results = [to_html(x) for x in results]
        if flat:
            return results[0]
        return results

    def pretty_print(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl', con='con',
                     show_header=True, html=IPYTHON):
        """
        Print a pretty text representation which visualizes linguistic structures.

        Inside IPython (and with ``html`` enabled) the sentences are displayed as HTML, otherwise they are
        printed as text to the standard output.

        Args:
            tok: Token key.
            lem: Lemma key.
            pos: Part-of-speech key.
            dep: Dependency parse tree key.
            sdp: Semantic dependency tree/graph key. SDP visualization has not been implemented yet.
            ner: Named entity key.
            srl: Semantic role labeling key.
            con: Constituency parsing key.
            show_header: ``True`` to print a header which indicates each field with its name.
            html: ``True`` to output HTML format so that non-ASCII characters can align correctly.

        """
        results = self.to_pretty(tok, lem, pos, dep, sdp, ner, srl, con, show_header, html=html)
        # A flat document yields a single string; normalize it to a list of sentences.
        if isinstance(results, str):
            results = [results]
        if html and IPYTHON:
            # ``IPython.display`` is the public location of these names; importing ``display`` from
            # ``IPython.core.display`` is deprecated.
            from IPython.display import display, HTML
            display(HTML('<br>'.join(results)))
        else:
            # Multi-line renderings are separated by a blank line, one-liners by a single new line.
            sent_new_line = '\n\n' if any('\n' in x for x in results) else '\n'
            print(sent_new_line.join(results))

    def translate(self, lang, tok='tok', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl'):
        """
        Translate tags for each annotation. This is an inplace operation.

        .. Attention:: Note that the translated document might not print well in terminal due to non-ASCII characters.

        Args:
            lang: Target language to be translated to. Only ``zh`` is available.
            tok: Token key.
            pos: Part-of-speech key.
            dep: Dependency parse tree key.
            sdp: Semantic dependency tree/graph key. SDP visualization has not been implemented yet.
            ner: Named entity key.
            srl: Semantic role labeling key.

        Returns:
            The translated document, which is this very document.

        Raises:
            NotImplementedError: If there is no translation for ``lang``.

        """
        if lang != 'zh':
            raise NotImplementedError(f'No translation for {lang}. '
                                      f'Please contribute to our translation at https://github.com/hankcs/HanLP')
        from hanlp.utils.lang.zh import localization
        flat = isinstance(self[tok][0], str)
        tasks_and_keys = (('pos', pos), ('ner', ner), ('dep', dep), ('sdp', sdp), ('srl', srl))
        for task, name in tasks_and_keys:
            annotations = self.get(name, None)
            if not annotations:
                continue
            if flat:
                annotations = [annotations]
            # NOTE(review): the table is looked up by the key name rather than by the task name, so a
            # customized key only gets translated when localization has an attribute of that name -- confirm.
            table: dict = getattr(localization, name, None)
            if not table:
                continue
            # ner and dep items carry their label in the second slot, which is replaced in place.
            label_in_second_slot = task in ('ner', 'dep')
            for anno_per_sent in annotations:
                for index, item in enumerate(anno_per_sent):
                    if label_in_second_slot:
                        item[1] = table.get(item[1], item[1])
                    else:
                        anno_per_sent[index] = table.get(item, item)
        return self

    def squeeze(self, i=0):
        r"""
        Pick one sentence out of a nested document, i.e., turn ``[[sent_i]]`` into ``[sent_i]``. Every value
        which is a ``list`` is replaced by its ``i-th`` element while all other values are carried over as
        they are. This document itself is left untouched.

        Args:
            i: Index of the element to keep for all ``list``\s.

        Returns:
            A squeezed document with only one sentence.

        """
        squeezed = Document()
        # Filled through ``update`` rather than the constructor, so that no conversion of values is triggered.
        squeezed.update((key, value[i] if isinstance(value, list) else value) for key, value in self.items())
        return squeezed

    def _to_doc_without_spans(self, tok: str):
        """
        Strip the spans attached to tokens. Four layouts of tokens are told apart: a flat sentence of strings,
        a flat sentence of ``[form, begin, end]`` items, nested sentences of strings and nested sentences of
        ``[form, begin, end]`` items.

        Args:
            tok: The key to tokens.

        Returns:
            This document itself if its tokens carry no spans, otherwise a new document whose tokens are
            plain strings.

        """
        tokens = self[tok]
        head = tokens[0]
        # Flat sentence of plain strings.
        if isinstance(head, str):
            return self
        tail = head[-1]
        # Nested sentences of plain strings.
        if isinstance(tail, str):
            return self
        if isinstance(tail, int):
            # Flat sentence whose items end with an integer offset.
            forms = [item[0] for item in tokens]
        else:
            # Nested sentences whose items carry offsets.
            forms = [[item[0] for item in sent] for sent in tokens]
        stripped = Document(**self)
        stripped[tok] = forms
        return stripped

    def get_by_prefix(self, prefix: str):
        """
        Look up a value through the prefix of its key.

        Args:
            prefix: The prefix of a key. If multiple keys are matched, only the first one will be used.

        Returns:
            The value assigned with the matched key, or ``None`` if no key is matched.
        """
        matched = prefix_match(prefix, self)
        return self[matched] if matched else None

    def count_sentences(self) -> int:
        """
        Count number of sentences in this document.

        The tokens are looked up by the ``tok`` prefix. A flat sentence counts as one, no matter whether its
        tokens are plain strings or ``[form, begin, end]`` items.

        Returns:
            Number of sentences, ``0`` if the document has no tokens.
        """
        tok = self.get_by_prefix('tok')
        if not tok:
            # No token field at all or an empty one.
            return 0
        first = tok[0]
        if isinstance(first, str):
            return 1
        # A flat sentence with spans holds items ending with an integer offset; it must not be mistaken for
        # a list of sentences.
        if first and isinstance(first[-1], int):
            return 1
        return len(tok)


================================================
FILE: plugins/hanlp_common/hanlp_common/io.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-16 22:38
import json
import os
import pickle
import sys
from typing import Union


def save_pickle(item, path):
    """Pickle an object to a file.

    Missing parent directories are created first, which matches the behavior of ``save_json``.

    Args:
        item: The object to be pickled.
        path: Path of the file to write.
    """
    dirname = os.path.dirname(path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(item, f)


def load_pickle(path):
    """Read back a pickled object from a file.

    Note that unpickling can execute arbitrary code, so only files from trusted sources should be loaded.

    Args:
        path: Path of the pickle file.

    Returns:
        The unpickled object.
    """
    with open(path, 'rb') as stream:
        restored = pickle.load(stream)
    return restored


def save_json(item: Union[dict, list, str, int, float], path: str, ensure_ascii=False, cls=None,
              default=lambda o: repr(o), indent=2):
    """Dump an object as JSON into a UTF-8 encoded file, creating missing parent directories.

    Args:
        item: The object to be dumped.
        path: Path of the file to write.
        ensure_ascii: ``False`` to write non-ascii text as it is.
        cls: An optional custom JSON encoder class.
        default: Called for objects which are not serializable. Their ``repr`` is used by default.
        indent: Indent per nested structure.
    """
    parent = os.path.dirname(path)
    # A bare file name has no parent directory to create.
    if parent:
        os.makedirs(parent, exist_ok=True)
    options = dict(ensure_ascii=ensure_ascii, indent=indent, cls=cls, default=default)
    with open(path, 'w', encoding='utf-8') as out:
        json.dump(item, out, **options)


def load_json(path):
    """Parse a UTF-8 encoded JSON file.

    Args:
        path: Path of the JSON file.

    Returns:
        The decoded object.
    """
    with open(path, encoding='utf-8') as stream:
        decoded = json.load(stream)
    return decoded


def filename_is_json(filename):
    """Tell whether a file name ends with the ``.json`` or ``.jsonl`` extension (case-sensitive).

    Args:
        filename: A file name or path.

    Returns:
        ``True`` if the extension is one of the two.
    """
    extension = os.path.splitext(filename)[1]
    return extension in ('.json', '.jsonl')


def eprint(*args, **kwargs):
    """Print to the standard error stream.

    Args:
        *args: Objects to be printed, forwarded to ``print``.
        **kwargs: Keyword arguments forwarded to ``print``. ``file`` is already taken by this function.
    """
    print(*args, file=sys.stderr, **kwargs)


================================================
FILE: plugins/hanlp_common/hanlp_common/reflection.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 16:41
import importlib
import inspect


def classpath_of(obj) -> str:
    """Get the dotted path identifying an object.

    Args:
      obj: A function or an instance of some class.

    Returns:
      ``module.function`` for a function, otherwise ``module.ClassName`` of the class of ``obj``.

    """
    if not inspect.isfunction(obj):
        klass = obj.__class__
        return "{0}.{1}".format(klass.__module__, klass.__name__)
    return module_path_of(obj)


def module_path_of(func) -> str:
    """Get the dotted path of a function, i.e., the name of its module followed by its own name.

    Args:
      func: A function.

    Returns:
      The path in ``module.function`` format.

    """
    owner = inspect.getmodule(func)
    return '.'.join((owner.__name__, func.__name__))


def object_from_classpath(classpath, **kwargs):
    """Resolve a dotted path and build an object from it.

    Args:
      classpath: Path in ``module.name`` format.
      **kwargs: Arguments for the constructor. They are not used if the path points to a function.

    Returns:
      The function itself if the path points to a function, otherwise whatever calling the resolved
      object with ``kwargs`` returns.

    """
    resolved = str_to_type(classpath)
    return resolved if inspect.isfunction(resolved) else resolved(**kwargs)


def str_to_type(classpath):
    """Resolve a dotted path to the object it names.

    The part before the last dot is imported as a module and the part after it is fetched from that module.

    Args:
      classpath: Path in ``module.name`` format.

    Returns:
      The attribute of the module, usually a type.

    """
    module_name, class_name = classpath.rsplit(".", 1)
    module = importlib.import_module(module_name)
    return getattr(module, class_name)


def type_to_str(type_object) -> str:
    """Convert a type object to its path in str format by unwrapping ``<class '...'>`` from its ``str``.

    Args:
      type_object: A type.

    Returns:
      The text between ``<class '`` and ``'>``.

    """
    opening, closing = "<class '", "'>"
    text = str(type_object)
    assert text.startswith(opening), 'illegal input'
    text = text[len(opening):]
    assert text.endswith(closing), 'illegal input'
    return text[:-len(closing)]


================================================
FILE: plugins/hanlp_common/hanlp_common/structure.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-19 20:56
import json
from collections import OrderedDict

from hanlp_common.io import filename_is_json, save_pickle, load_pickle, save_json, load_json


class Serializable(object):
    """A super class for save/load operations.

    The state of an object is its ``__dict__``, which is either pickled or dumped as JSON.
    """

    @staticmethod
    def _is_json(path, fmt):
        # Without an explicit format, the format is decided by the extension of the file.
        if not fmt:
            return filename_is_json(path)
        return fmt in ['json', 'jsonl']

    def save(self, path, fmt=None):
        """Save to path in JSON or pickle format.

        Args:
          path(str): file path
          fmt: ``json`` or ``jsonl`` for JSON and anything else for pickle. Inferred from the extension of
            ``path`` if not given.

        """
        if self._is_json(path, fmt):
            self.save_json(path)
        else:
            self.save_pickle(path)

    def load(self, path, fmt=None):
        """Load from path in JSON or pickle format.

        Args:
          path(str): file path
          fmt: ``json`` or ``jsonl`` for JSON and anything else for pickle. Inferred from the extension of
            ``path`` if not given.

        """
        if self._is_json(path, fmt):
            self.load_json(path)
        else:
            self.load_pickle(path)

    def save_pickle(self, path):
        """Pickle this object to path.

        Args:
          path(str): file path

        """
        save_pickle(self, path)

    def load_pickle(self, path):
        """Unpickle an object from path and copy its state.

        Args:
          path(str): file path

        Returns:
          Whatever :meth:`copy_from` returns.

        """
        item = load_pickle(path)
        return self.copy_from(item)

    def save_json(self, path):
        """Dump the dict representation of this object as JSON to path.

        Args:
          path(str): file path

        """
        save_json(self.to_dict(), path)

    def load_json(self, path):
        """Load a JSON file and copy the state from the decoded object.

        Args:
          path(str): file path

        Returns:
          Whatever :meth:`copy_from` returns.

        """
        item = load_json(path)
        return self.copy_from(item)

    # @abstractmethod
    def copy_from(self, item):
        """Take over the state of another object.

        Args:
          item: Either an object, whose ``__dict__`` becomes the ``__dict__`` of this one, or a plain dict
            of attributes such as the one decoded by :meth:`load_json`.

        """
        if isinstance(item, dict) and not hasattr(item, '__dict__'):
            # A plain dict has no ``__dict__``: its items are the attributes themselves.
            self.__dict__ = dict(item)
        else:
            self.__dict__ = item.__dict__
        # raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))

    def to_json(self, ensure_ascii=False, indent=2, sort=False) -> str:
        """Convert to a JSON string.

        Args:
          ensure_ascii: ``False`` to allow for non-ascii text.
          indent: Indent per nested structure.
          sort: ``True`` to sort the top level items by key.

        Returns:
          A JSON string in which objects which are not serializable are replaced by their ``repr``.

        """
        d = self.to_dict()
        if sort:
            d = OrderedDict(sorted(d.items()))
        return json.dumps(d, ensure_ascii=ensure_ascii, indent=indent, default=lambda o: repr(o))

    def to_dict(self) -> dict:
        """Get the dict representation, which is the ``__dict__`` itself and not a copy of it."""
        return self.__dict__


class SerializableDict(Serializable, dict):
    """A ``dict`` with save/load operations whose items are also accessible as attributes."""

    def save_json(self, path):
        """Dump this dict as JSON to path.

        Args:
          path(str): file path

        """
        save_json(self, path)

    def copy_from(self, item):
        """Replace all the items with those of another dict. Anything but a dict is ignored.

        Args:
          item: The dict to copy from.

        """
        if isinstance(item, dict):
            self.clear()
            self.update(item)

    def __getattr__(self, key):
        # Only called when the regular attribute lookup fails. Dunder names are probed by protocols
        # such as copy and pickle, so they must raise AttributeError instead of being looked up as keys.
        # ``dict`` defines no ``__getattr__`` to delegate to, hence the error is raised explicitly.
        if key.startswith('__'):
            raise AttributeError(f'{type(self).__name__!r} object has no attribute {key!r}')
        # A missing key raises KeyError.
        return self.__getitem__(key)

    def __setattr__(self, key, value):
        # Attributes are stored as items.
        return self.__setitem__(key, value)

    def to_dict(self) -> dict:
        """Get the dict representation, which is this object itself."""
        return self

================================================
FILE: plugins/hanlp_common/hanlp_common/util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-27 19:09
import math
from typing import Union, Any, List, Optional, Tuple, Iterable, Dict
import inspect
from itertools import chain, combinations


def powerset(iterable, descending=False):
    """
    Enumerate every subset of the given items, grouped by size.

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)

    Args:
        iterable: The items to combine.
        descending: ``True`` to start with the largest subsets.

    Returns:
        An iterator over tuples.

    """
    items = list(iterable)
    sizes = range(len(items) + 1)
    if descending:
        sizes = reversed(sizes)
    return chain.from_iterable(combinations(items, size) for size in sizes)


def isdebugging():
    """Tell whether any frame of the call stack comes from a file named ``pydevd.py``.

    See Also https://stackoverflow.com/questions/333995/how-to-detect-that-python-code-is-being-executed-through-the-debugger
    """
    return any(info.filename.endswith("pydevd.py") for info in inspect.stack())


def list_is_list_of_lists(sent: Union[Any, List[Any]]) -> Optional[bool]:
    """Tell whether the first element of a sequence is a ``list``.

    Args:
        sent: A sequence to be checked.

    Returns:
        ``None`` if the input is empty, otherwise whether its first element is a ``list``.
    """
    return isinstance(sent[0], list) if sent else None


def set_tuple_with(t: Tuple, v, at=0) -> Tuple:
    """Make a copy of a tuple with one of its elements replaced.

    Args:
        t: The tuple to copy.
        v: The new value.
        at: Index of the element to be replaced.

    Returns:
        A new tuple.
    """
    items = [*t]
    items[at] = v
    return tuple(items)


def consume_keys_from_dict(keys: Iterable, d: dict) -> dict:
    """Move the given keys out of a dict. Keys missing from the dict are skipped.

    Args:
        keys: The keys to be removed.
        d: The dict to remove from, which is modified in place.

    Returns:
        A dict of the removed items.
    """
    return {key: d.pop(key) for key in keys if key in d}


def merge_dict(d: dict, overwrite=False, inplace=False, **kwargs):
    """Merge a dict with additional key-value pairs.

    Args:
      d: The dict to start from.
      overwrite: ``True`` to let ``kwargs`` replace the values of keys already in ``d``. (Default value = False)
      inplace: ``True`` to write the result back to ``d`` and return ``d``. (Default value = False)
      **kwargs: The key-value pairs to merge in.

    Returns:
      The merged dict, which is ``d`` itself when ``inplace`` is set and a new dict otherwise.

    """
    merged = dict(d)
    for key, value in kwargs.items():
        if overwrite or key not in d:
            merged[key] = value
    if not inplace:
        return merged
    d.update(merged)
    return d


def merge_locals_kwargs(locals: dict, kwargs: dict = None, excludes=('self', 'kwargs', '__class__')):
    """Merge the local variables of a function with its extra keyword arguments via ``merge_dict``.

    Args:
        locals: Usually the ``locals()`` of the calling function.
        kwargs: Extra keyword arguments, which do not replace the variables in ``locals``.
        excludes: Names in ``locals`` to be left out.

    Returns:
        The merged dict.
    """
    extras = kwargs if kwargs else dict()
    kept = dict((name, value) for name, value in list(locals.items()) if name not in excludes)
    return merge_dict(kept, **extras)


def infer_space_after(sent: List[str]):
    """Guess for each token of a sentence whether it is followed by a whitespace.

    No space is put after an opening double quote or ``(``, and none before a closing double quote, the
    listed punctuation marks, ``n't`` and any token starting with an apostrophe.

    Args:
        sent: The tokens of a sentence.

    Returns:
        A list of booleans, one per token.
    """
    no_space_before = [".", ":", ",", ";", ")", "n't", "!", "?"]
    whitespace_after = [True] * len(sent)
    quotes_seen = 0
    previous = None
    for index, word in enumerate(sent):
        if word == '"':
            quotes_seen += 1
            if quotes_seen % 2:
                # An odd quote opens a quotation and sticks to the token on its right.
                whitespace_after[index] = False
            elif previous is not None:
                # An even quote closes a quotation and sticks to the token on its left.
                whitespace_after[previous] = False

        if previous is not None and (word in no_space_before or word.startswith("'")):
            whitespace_after[previous] = False

        if word == "(":
            whitespace_after[index] = False

        previous = index
    return whitespace_after


def collapse_json(text, indent=12):
    """Compacts a string of json data by collapsing whitespace after the
    specified indent level

    Lines indented less than ``indent`` are kept as they are. Lines at exactly
    ``indent`` are kept too, unless they enclose lines which are indented
    further, in which case the whole group is joined into one single line.

    NOTE: will not produce correct results when indent level is not a multiple
    of the json indent level

    Args:
      text: The json text with one item per line.
      indent: Number of leading spaces from which on lines get collapsed.  (Default value = 12)

    Returns:
      The compacted text.

    """
    initial = " " * indent
    out = []  # final json output
    sublevel = []  # accumulation list for sublevel entries
    pending = None  # holder for consecutive entries at exact indent level
    for line in text.splitlines():
        if line.startswith(initial):
            # the length is checked first as a line consisting of nothing but
            # the indent has no character to look at
            if len(line) > indent and line[indent] == " ":
                # found a line indented further than the indent level, so add
                # it to the sublevel list
                if pending:
                    # the first item in the sublevel will be the pending item
                    # that was the previous line in the json
                    sublevel.append(pending)
                    pending = None
                item = line.strip()
                sublevel.append(item)
                if item.endswith(","):
                    sublevel.append(" ")
            elif sublevel:
                # found a line at the exact indent level *and* we have sublevel
                # items. This means the sublevel items have come to an end
                sublevel.append(line.strip())
                out.append("".join(sublevel))
                sublevel = []
            else:
                # found a line at the exact indent level but no items indented
                # further, so possibly start a new sub-level
                if pending:
                    # if there is already a pending item, it means that
                    # consecutive entries in the json had the exact same
                    # indentation and that last pending item was not the start
                    # of a new sublevel.
                    out.append(pending)
                pending = line.rstrip()
        else:
            if pending:
                # it's possible that an item will be pending but not added to
                # the output yet, so make sure it's not forgotten.
                out.append(pending)
                pending = None
            if sublevel:
                out.append("".join(sublevel))
                # start over, otherwise the flushed items would be emitted
                # again together with the next sublevel
                sublevel = []
            out.append(line)
    # the text may end right on a pending item or in the middle of a sublevel,
    # neither of which must get lost
    if pending:
        out.append(pending)
    if sublevel:
        out.append("".join(sublevel))
    return "\n".join(out)


class DummyContext:
    """A context manager that does nothing on entry and nothing on exit.

    ``__enter__`` yields ``None`` to the ``as`` target and ``__exit__``
    returns ``None``, so exceptions raised inside the ``with`` body are not
    suppressed.
    """

    def __enter__(self):
        return None

    def __exit__(self, exc_type, exc_val, exc_tb):
        return None


def merge_list_of_dict(samples: List[Dict]) -> dict:
    """Transpose a list of dicts into a dict of lists.

    Args:
      samples: Dicts to merge. They do not need to share the same keys.

    Returns:
      A dict mapping every key seen in ``samples`` to the list of values
      found under that key, in the order the samples were visited.

    """
    merged = {}
    for sample in samples:
        for key, value in sample.items():
            # Create the bucket on first sight of the key, then append.
            merged.setdefault(key, []).append(value)
    return merged


def split_dict(batch: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Split a dict of lists into a list of per-sample dicts.

    Only entries whose value is a ``list`` take part; every other entry is
    dropped from the result.

    Args:
      batch: Mapping from field name to a list of per-sample values (plus,
        possibly, non-list entries that are ignored).

    Returns:
      One dict per sample index, up to the length of the longest list. An
      empty list is returned when ``batch`` contains no list-valued entry.

    Raises:
      IndexError: If the list-valued entries have different lengths.

    """
    columns = {key: values for key, values in batch.items() if isinstance(values, list)}
    if not columns:
        # Nothing to split; max() over an empty sequence would raise.
        return []
    num_samples = max(len(values) for values in columns.values())
    return [{key: values[i] for key, values in columns.items()} for i in range(num_samples)]


def reorder(samples: List, order: List[int]) -> List:
    """Rearrange ``samples`` following the ascending sort of ``order``.

    Args:
      samples: Items to rearrange.
      order: Sort key for each position; ``order[k]`` is the key of position ``k``.

    Returns:
      A new list holding ``samples[k]`` for every position ``k`` of ``order``,
      visited from the smallest key to the largest (ties keep position order).

    """
    positions = list(range(len(order)))
    positions.sort(key=lambda position: order[position])
    return [samples[position] for position in positions]


def k_fold(k, total, i):
    """Compute the train / test index lists of fold ``i`` out of ``k`` folds.

    Args:
      k: Number of folds.
      total: Number of samples; indices run from 0 to ``total - 1``.
      i: Zero-based index of the fold used for testing.

    Returns:
      A ``(train_indices, test_indices)`` pair of lists. The test indices form
      one contiguous range and the train indices are everything else.

    """
    test_begin = math.ceil(i / k * total)
    test_end = math.ceil((i + 1) / k * total)
    test_indices = list(range(test_begin, test_end))
    train_indices = [*range(0, test_begin), *range(test_end, total)]
    return train_indices, test_indices


def dfs(graph, start):
    """Walk ``graph`` depth-first from ``start`` using an explicit stack.

    Args:
      graph: Mapping from a node to an iterable of its neighbours.
      start: Node to begin from.

    Returns:
      The reachable nodes in the order they were first visited.

    """
    visited = set()
    visit_order = []
    pending = [start]
    while pending:
        node = pending.pop()
        if node in visited:
            continue
        visited.add(node)
        visit_order.append(node)
        # Neighbours are pushed in order, so the last one is explored first.
        pending.extend(graph[node])

    return visit_order


def topological_sort(graph, start):
    """Order the nodes reachable from ``start`` with an iterative depth-first walk.

    Args:
      graph: Mapping from a node to a container of its successors.
      start: Node to begin from.

    Returns:
      A list of the visited nodes: the chain still open when the walk ends,
      followed by the closed nodes in reverse closing order.
      NOTE(review): presumably meant to be a topological order for acyclic
      graphs; confirm for the graphs callers pass in.

    """
    visited = set()
    open_chain = []
    closed = []
    pending = [start]
    while pending:
        node = pending.pop()
        if node in visited:
            continue
        visited.add(node)
        pending.extend(graph[node])

        # Close every open node that is not a direct predecessor of ``node``.
        while open_chain and node not in graph[open_chain[-1]]:
            closed.append(open_chain.pop())
        open_chain.append(node)

    closed.reverse()
    return open_chain + closed


def prefix_match(target, sources: Iterable[str]):
    """Find ``target`` in ``sources``, falling back to the first item it prefixes.

    Args:
      target: String to look up; ``None`` short-circuits to ``None``.
      sources: Candidate strings.

    Returns:
      ``target`` itself when it is contained in ``sources``; otherwise the
      first element of ``sources`` that starts with ``target``; ``None`` when
      nothing matches.

    """
    if target is None:
        return None
    if target in sources:
        return target
    return next((candidate for candidate in sources if candidate.startswith(target)), None)


================================================
FILE: plugins/hanlp_common/hanlp_common/visualization.py
================================================
# -*- coding:utf-8 -*-
# Modified from https://github.com/tylerneylon/explacy
import io
from collections import defaultdict
from pprint import pprint

from phrasetree.tree import Tree


def make_table(rows, insert_header=False):
    """Format ``rows`` as tab-separated, left-aligned, space-padded columns.

    Column widths come from the body rows only (``rows[1:]``); the header
    cells in ``rows[0]`` are truncated to those widths. ``rows`` is modified
    in place: ``rows[0]`` is replaced by its truncated version and, when
    ``insert_header`` is true, a ruler row is inserted at index 1.

    Args:
      rows: List of rows of strings; the first row is the header.
      insert_header: Whether to add a ``─`` ruler row under the header. (Default value = False)

    Returns:
      The table as one string with rows joined by newlines.

    """
    body_columns = zip(*rows[1:])
    col_widths = [max(map(len, column)) for column in body_columns]
    rows[0] = [title[:width] for title, width in zip(rows[0], col_widths)]
    if insert_header:
        ruler = ['─' * width for width in col_widths]
        rows.insert(1, ruler)
    fmt = '\t'.join('%%-%ds' % width for width in col_widths)
    rendered = [fmt % tuple(row) for row in rows]
    return '\n'.join(rendered)


def _start_end(arrow):
    start, end = arrow['from'], arrow['to']
    mn = min(start, end)
    mx = max(start, end)
    return start, end, mn, mx


def pretty_tree_horizontal(arrows, _do_print_debug_info=False):
    """Render dependency arrows as box-drawing text, one output row per position.

    Args:
      arrows: List of dicts, each holding ``'from'`` and ``'to'`` positions
        that are used as indices into the output rows. Every dict is mutated
        in place: the keys ``'underset'``, ``'num_deps'``, ``'num_deps_left'``
        and ``'height'`` are written.
      _do_print_debug_info: When true, intermediate state is printed to
        stdout. (Default value = False)

    Returns:
      The list returned by ``render_arrows``: one string per position from 0
      up to the largest position used by any arrow.

    """
    # Pass 1: for every arrow collect, in 'underset', the indices of the other
    # arrows that must be rendered before it. An arrow is only drawn once all
    # arrows in its underset have been drawn, and it is placed higher than them.
    arrows_with_deps = defaultdict(set)  # number of pending dependencies -> arrow indices
    for i, arrow in enumerate(arrows):
        arrow['underset'] = set()
        if _do_print_debug_info:
            print('Arrow %d: "%s" -> "%s"' % (i, arrow['from'], arrow['to']))
        num_deps = 0
        start, end, mn, mx = _start_end(arrow)
        for j, other in enumerate(arrows):
            if arrow is other:
                continue
            o_start, o_end, o_mn, o_mx = _start_end(other)
            # `other` goes under this arrow when it shares the same start and
            # ends inside [mn, mx], or starts elsewhere but inside [mn, mx].
            if ((start == o_start and mn <= o_end <= mx) or
                    (start != o_start and mn <= o_start <= mx)):
                num_deps += 1
                if _do_print_debug_info:
                    print('%d is over %d' % (i, j))
                arrow['underset'].add(j)
        arrow['num_deps_left'] = arrow['num_deps'] = num_deps
        arrows_with_deps[num_deps].add(i)

    if _do_print_debug_info:
        print('')
        print('arrows:')
        pprint(arrows)

        print('')
        print('arrows_with_deps:')
        pprint(arrows_with_deps)

    # Pass 2: render the arrows in characters. Some heights will be raised to make room for arrowheads.
    # lines[p] is the list of cells for position p. A cell is either a set of
    # compass directions ('n', 'e', 's', 'w') or a literal character; index 0
    # ends up as the rightmost character because render_arrows reverses each row.
    sent_len = (max([max(arrow['from'], arrow['to']) for arrow in arrows]) if arrows else 0) + 1
    lines = [[] for i in range(sent_len)]
    num_arrows_left = len(arrows)
    while num_arrows_left > 0:

        # There must always be an arrow with no unrendered dependency left,
        # otherwise the loop could not make progress.
        assert len(arrows_with_deps[0])

        arrow_index = arrows_with_deps[0].pop()
        arrow = arrows[arrow_index]
        src, dst, mn, mx = _start_end(arrow)

        # Check the height needed: above every arrow in the underset, at least
        # 3, and leaving 3 cells beyond what is already drawn on the dst row.
        height = 3
        if arrow['underset']:
            height = max(arrows[i]['height'] for i in arrow['underset']) + 1
        height = max(height, 3, len(lines[dst]) + 3)
        arrow['height'] = height

        if _do_print_debug_info:
            print('')
            print('Rendering arrow %d: "%s" -> "%s"' % (arrow_index,
                                                        arrow['from'],
                                                        arrow['to']))
            print('  height = %d' % height)

        # True when the source position is greater than the destination position.
        goes_up = src > dst

        # Draw the outgoing src line.
        if lines[src] and len(lines[src]) < height:
            # Extend the existing outermost cell so the line continues outward.
            lines[src][-1].add('w')
        while len(lines[src]) < height - 1:
            lines[src].append(set(['e', 'w']))
        if len(lines[src]) < height:
            lines[src].append({'e'})
        lines[src][height - 1].add('n' if goes_up else 's')

        # Draw the incoming dst line: arrowhead first, then a horizontal run
        # whose last cell becomes the corner joining the vertical segment.
        lines[dst].append(u'►')
        while len(lines[dst]) < height:
            lines[dst].append(set(['e', 'w']))
        lines[dst][-1] = set(['e', 's']) if goes_up else set(['e', 'n'])

        # Draw the adjoining vertical line on every row strictly between src and dst.
        for i in range(mn + 1, mx):
            while len(lines[i]) < height - 1:
                lines[i].append(' ')
            lines[i].append(set(['n', 's']))

        # Update arrows_with_deps: every arrow waiting on the one just drawn
        # moves to the bucket with one fewer pending dependency.
        for arr_i, arr in enumerate(arrows):
            if arrow_index in arr['underset']:
                arrows_with_deps[arr['num_deps_left']].remove(arr_i)
                arr['num_deps_left'] -= 1
                arrows_with_deps[arr['num_deps_left']].add(arr_i)

        num_arrows_left -= 1

    return render_arrows(lines)


def render_arrows(lines):
    """Convert rows of drawing cells into right-aligned strings, in place.

    Args:
      lines: List of rows. Each row is a list whose items are either a ``set``
        of compass directions (``'n'``, ``'e'``, ``'s'``, ``'w'``) or a literal
        character. Each row is replaced by its rendered string.

    Returns:
      The same ``lines`` list, now holding one string per row. Every row is
      reversed and left-padded with spaces to the length of the longest row.

    """
    glyphs = {'ew': u'─',
              'ns': u'│',
              'en': u'└',
              'es': u'┌',
              'enw': u'┴',
              'ensw': u'┼',
              'ens': u'├',
              'esw': u'┬'}
    width = max(len(cells) for cells in lines)
    for row, cells in enumerate(lines):
        drawn = []
        for cell in cells:
            if type(cell) is set:
                # Sorting the directions yields the lookup key, e.g. {'w', 'e'} -> 'ew'.
                drawn.append(glyphs[''.join(sorted(cell))])
            else:
                drawn.append(cell)
        text = ''.join(reversed(drawn))
        lines[row] = text.rjust(width)
    return lines


def render_span(begin, end, unidirectional=False):
    """Draw a vertical bracket covering the rows ``begin`` to ``end - 1``.

    Args:
      begin: Index of the first covered row.
      end: Index one past the last covered row.
      unidirectional: Use plain line ends instead of ``◄`` arrowheads on the
        first and last rows. (Default value = False)

    Returns:
      A list with one string per covered row. The middle row carries the
      outgoing ``►``. An empty list is returned when the span covers no rows.

    """
    length = end - begin
    if length == 1:
        return ['───►']

    if unidirectional:
        top, bottom, joint = '──┐', '──┘', '──┴►'
    else:
        top, bottom, joint = '◄─┐', '◄─┘', '◄─┴►'

    if length == 2:
        return [top, joint]

    middle = begin + length // 2
    rows = []
    for position in range(begin, end):
        if position == middle:
            rows.append('  ├►')
        elif position == begin:
            rows.append(top)
        elif position == end - 1:
            rows.append(bottom)
        else:
            rows.append('  │')
    return rows


def tree_to_list(T):
    """Convert a ``Tree`` into nested ``[label, children]`` lists.

    Children that are ``Tree`` instances are converted recursively; any other
    child is kept as is.
    """
    label = T.label()
    children = []
    for child in T:
        if isinstance(child, Tree):
            children.append(tree_to_list(child))
        else:
            children.append(child)
    return [label, children]


def list_to_tree(L):
    """Rebuild a ``Tree`` from nested ``[label, children]`` lists.

    A plain string is treated as a leaf and returned unchanged.
    """
    if not isinstance(L, str):
        label, children = L[0], L[1]
        return Tree(label, [list_to_tree(child) for child in children])
    return L


def render_labeled_span(b, e, spans, labels, label, offset, unidirectional=False):
    """Append one rendered span and its label to the ``spans`` / ``labels`` columns.

    Args:
      b: First row covered by the span.
      e: One past the last row covered by the span.
      spans: List of bracket rows, extended in place.
      labels: List of label rows, extended in place.
      label: Text placed on the middle row of the span.
      offset: Row index already reached by ``spans`` and ``labels``; empty
        rows are added from ``offset`` up to the span.
      unidirectional: Forwarded to ``render_span``. (Default value = False)

    """
    leading_gap = [''] * (b - offset)
    spans.extend(leading_gap)
    spans.extend(render_span(b, e, unidirectional))
    center = b + (e - b) // 2
    # Empty rows up to the middle of the span, the label, then empty rows to its end.
    labels.extend([''] * (center - offset) + [label] + [''] * (e - center - 1))


def main():
    """Print a sample bidirectional span covering rows 7 to 11."""
    # To try the dependency renderer instead, build a list of
    # {'from': ..., 'to': ...} dicts, pass it to pretty_tree_horizontal and
    # print the returned lines joined by newlines.
    sample = render_span(7, 12)
    print('\n'.join(sample))


# NOTE: when this file is run as a script, main() executes here, before the
# names defined below this point are bound.
if __name__ == '__main__':
    main()
# Characters placed at the left / right edge of each cell of the Markdown
# ruling row, keyed by alignment character ('<' left, '^' center, '>' right).
left_rule = {'<': ':', '^': ':', '>': '-'}
right_rule = {'<': '-', '^': ':', '>': ':'}


def evalute_field(record, field_spec):
    """Extract one field from ``record`` and return it as a string.

    The exact type of ``field_spec`` selects how the value is obtained.

    Args:
      record: The object to read from.
      field_spec: An ``int`` is used as an index into ``record``, a ``str`` as
        an attribute name, and anything else is called with ``record``.

    Returns:
      ``str()`` of the extracted value.

    """
    spec_type = type(field_spec)
    if spec_type is int:
        value = record[field_spec]
    elif spec_type is str:
        value = getattr(record, field_spec)
    else:
        value = field_spec(record)
    return str(value)


def markdown_table(headings, records, fields=None, alignment=None, file=None):
    """Generate a Doxygen-flavor Markdown table from records.
    See https://stackoverflow.com/questions/13394140/generate-markdown-tables

    Possible alignment characters are:
        '<' = Left align
        '>' = Right align
        '^' = Center

    Args:
      headings: List of column headings.
      records: Iterable. Rows will be generated from this.
      fields: List of fields for each row, one per heading. Each entry may be
        an integer (index into each record), a string (attribute name of each
        record) or a function (called with the record; its return value is
        the cell). Defaults to the indices ``0 .. len(headings) - 1``.
        (Default value = None)
      alignment: List of ``(heading_alignment, cell_alignment)`` pairs, one
        per column. Extra pairs are ignored and missing ones are filled with
        ``('^', '>')``. When omitted, the first column uses ``('^', '<')``.
        Doxygen won't respect the heading alignment, but it might look good.
        The list passed in is never modified. (Default value = None)
      file: Any object with a ``write`` method that takes a single string
        parameter. A fresh ``io.StringIO`` is used when omitted.
        (Default value = None)

    Returns:
      The text of the buffer when the output went to an ``io.StringIO``
      (either created here or supplied by the caller), otherwise ``None``.

    Raises:
      ValueError: If ``fields`` and ``headings`` differ in length.

    """
    # Only a buffer created here may be closed; a caller-supplied one is left open.
    owns_buffer = not file
    if owns_buffer:
        file = io.StringIO()
    num_columns = len(headings)
    if not fields:
        fields = list(range(num_columns))
    if len(fields) != num_columns:
        raise ValueError('Expected %d fields to match the headings but got %d'
                         % (num_columns, len(fields)))

    # Compute the table cell data
    columns = [[] for i in range(num_columns)]
    for record in records:
        for i, field in enumerate(fields):
            columns[i].append(evalute_field(record, field))

    # Fill out any missing alignment characters. Work on a copy so that the
    # caller's list is not extended as a side effect.
    extended_align = list(alignment) if alignment is not None else [('^', '<')]
    if len(extended_align) > num_columns:
        extended_align = extended_align[0:num_columns]
    elif len(extended_align) < num_columns:
        extended_align += [('^', '>') for i in range(num_columns - len(extended_align))]

    heading_align, cell_align = zip(*extended_align)

    # Every column is as wide as its widest cell or heading, and at least 2
    # characters so that the ruling has room for both of its edge characters.
    field_widths = [max(map(len, column), default=0) for column in columns]
    heading_widths = [max(len(head), 2) for head in headings]
    column_widths = [max(x) for x in zip(field_widths, heading_widths)]

    _ = ' | '.join(['{:' + a + str(w) + '}'
                    for a, w in zip(heading_align, column_widths)])
    heading_template = '| ' + _ + ' |'
    _ = ' | '.join(['{:' + a + str(w) + '}'
                    for a, w in zip(cell_align, column_widths)])
    row_template = '| ' + _ + ' |'

    _ = ' | '.join([left_rule[a] + '-' * (w - 2) + right_rule[a]
                    for a, w in zip(cell_align, column_widths)])
    ruling = '| ' + _ + ' |'

    file.write(heading_template.format(*headings).rstrip() + '\n')
    file.write(ruling.rstrip() + '\n')
    for row in zip(*columns):
        file.write(row_template.format(*row).rstrip() + '\n')
    if isinstance(file, io.StringIO):
        text = file.getvalue()
        if owns_buffer:
            file.close()
        return text


================================================
FILE: plugins/hanlp_common/setup.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 19:26
from os.path import abspath, join, dirname
from setuptools import find_packages, setup

# Resolve paths relative to this file so the script works from any working directory.
this_dir = abspath(dirname(__file__))
# The README next to this file is used as the long description of the package.
with open(join(this_dir, 'README.md'), encoding='utf-8') as file:
    long_description = file.read()

setup(
    name='hanlp_common',
    version='0.0.22',
    description='HanLP: Han Language Processing',
    long_description=long_description,
    long_description_content_type="text/markdown",
    url='https://github.com/hankcs/HanLP',
    author='hankcs',
    author_email='hankcshe@gmail.com',
    license='Apache License 2.0',
    classifiers=[
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        "Development Status :: 3 - Alpha",
        'Operating System :: OS Independent',
        "License :: OSI Approved :: Apache Software License",
        'Programming Language :: Python :: 3 :: Only',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        "Topic :: Text Processing :: Linguistic"
    ],
    keywords='corpus,machine-learning,NLU,NLP',
    # Ship every package found here except documentation and tests.
    packages=find_packages(exclude=['docs', 'tests*']),
    include_package_data=True,
    # Dependencies installed unconditionally.
    install_requires=[
        'phrasetree>=0.0.9',
    ],
    # Optional dependencies, installed with the 'full' extra.
    extras_require={
        # These AMR dependencies might not be necessary for most people.
        'full': [
            'networkx',
            'penman==0.6.2',
        ],
    },
    python_requires='>=3.6',
)


================================================
FILE: plugins/hanlp_demo/README.md
================================================
# Demos and examples for HanLP

This package is intended for demonstration purposes and won't be released to PyPI. **Training requires a fair understanding of Linux and Python which might not be the case for everybody.**

You need a Linux/macOS system with Internet access because some corpora and bash scripts will be downloaded during training. Training on Windows might work if you are an expert, but we believe that is very rare.

Your `python` command needs to be Python2 while `python3` needs to be Python3.

You need to install this package and run it from the **root** folder of HanLP.

```bash
pip install -e plugins/hanlp_demo
python3 plugins/hanlp_demo/hanlp_demo/zh/train/open_small.py
```


================================================
FILE: plugins/hanlp_demo/hanlp_demo/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-29 17:48


================================================
FILE: plugins/hanlp_demo/hanlp_demo/block_windows.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-07-28 21:38
from hanlp.utils.io_util import windows

# Importing this module fails with an AssertionError whenever windows() returns
# a truthy value, which stops the importing script before it does any work.
assert not windows(), 'Windows is not supported for this script. Please run it on Linux systems.'


================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 17:55

================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_amr.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-25 19:09
import hanlp

# Load the pretrained model registered as AMR3_SEQ2SEQ_BART_LARGE, parse one
# English sentence with it and print the result.
amr_parser = hanlp.load(hanlp.pretrained.amr.AMR3_SEQ2SEQ_BART_LARGE)
amr = amr_parser('The boy wants the girl to believe him.')
print(amr)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_dep.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 17:55
import hanlp

# Load the pretrained model registered as PTB_BIAFFINE_DEP_EN.
syntactic_parser = hanlp.load(hanlp.pretrained.dep.PTB_BIAFFINE_DEP_EN)
# The input sentence is a list of (token, part-of-speech tag) pairs.
sent = [('Is', 'VBZ'),
        ('this', 'DT'),
        ('the', 'DT'),
        ('future', 'NN'),
        ('of', 'IN'),
        ('chamber', 'NN'),
        ('music', 'NN'),
        ('?', '.')]
tree = syntactic_parser(sent)
print(tree)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_lm.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-02-11 09:14
import hanlp

# Load the pretrained language model registered as FLAIR_LM_FW_WMT11_EN_TF and
# print the text it generates from the characters of 'hello'.
lm = hanlp.load(hanlp.pretrained.rnnlm.FLAIR_LM_FW_WMT11_EN_TF)
print(''.join(lm.generate_text(list('hello'))))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-03 22:50
import hanlp

# Load the pretrained model registered as CONLL03_NER_BERT_BASE_CASED_EN and
# run it on one sentence that is already split into tokens.
recognizer = hanlp.load(hanlp.pretrained.ner.CONLL03_NER_BERT_BASE_CASED_EN)
print(recognizer(["President", "Obama", "is", "speaking", "at", "the", "White", "House", "."]))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_pipeline.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-04 21:05
import hanlp
from hanlp.utils.lang.en.english_tokenizer import tokenize_english

# Components chained by the pipeline below: a rule-based English tokenizer and
# three pretrained models (tagger, syntactic parser, semantic parser).
tokenizer = tokenize_english
tagger = hanlp.load(hanlp.pretrained.pos.PTB_POS_RNN_FASTTEXT_EN)
syntactic_parser = hanlp.load(hanlp.pretrained.dep.PTB_BIAFFINE_DEP_EN)
semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PAS_BIAFFINE_EN)

# Each step stores its result under output_key; both parsers read the
# 'tokens' and 'part_of_speech_tags' entries written by the earlier steps.
pipeline = hanlp.pipeline() \
    .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
    .append(tokenizer, output_key='tokens') \
    .append(tagger, output_key='part_of_speech_tags') \
    .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies',
            conll=False) \
    .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies',
            conll=False)
print(pipeline)

text = '''Jobs and Wozniak co-founded Apple in 1976 to sell Wozniak's Apple I personal computer.
Together the duo gained fame and wealth a year later with the Apple II.
'''

# Run the whole pipeline on the raw text.
doc = pipeline(text)
print(doc)

# You can save the config to disk for deploying or sharing.
pipeline.save('en.json')
# Then load it smoothly.
deployed = hanlp.load('en.json')
print(deployed)
print(deployed(text))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_pos.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-03 22:16
import hanlp

# Load the pretrained model registered as PTB_POS_RNN_FASTTEXT_EN and tag a
# batch of two sentences that are already split into tokens.
tagger = hanlp.load(hanlp.pretrained.pos.PTB_POS_RNN_FASTTEXT_EN)
print(tagger([['I', 'banked', '2', 'dollars', 'in', 'a', 'bank', '.'],
              ['Is', 'this', 'the', 'future', 'of', 'chamber', 'music', '?']]))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_sdp.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-03 15:26
import hanlp
from hanlp_common.conll import CoNLLSentence

# semeval15 offers three independent annotations over the Penn Treebank (PTB).
# The PAS model is loaded here; uncomment one of the two lines below to try
# the DM or PSD model instead.
semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PAS_BIAFFINE_EN)
# semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_DM_BIAFFINE_EN)
# semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PSD_BIAFFINE_EN)
# The input sentence is a list of (token, part-of-speech tag) pairs.
sent = [('Is', 'VBZ'),
        ('this', 'DT'),
        ('the', 'DT'),
        ('future', 'NN'),
        ('of', 'IN'),
        ('chamber', 'NN'),
        ('music', 'NN'),
        ('?', '.')]
tree = semantic_parser(sent)  # type:CoNLLSentence
print(tree)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_sentiment_analysis.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 03:52
import hanlp

# Load the model by its string identifier 'SST2_ALBERT_BASE_EN' and print the
# prediction for one sentence.
classifier = hanlp.load('SST2_ALBERT_BASE_EN')
print(classifier.predict('I feel lucky'))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_tok.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-02 19:41
from hanlp.utils.lang.en.english_tokenizer import tokenize_english

# The backslash after the opening quotes keeps the text from starting with a newline.
text = """\
Don't go gentle into that good night.
"""
# tokenize_english needs no model to be loaded; it is called on the raw text directly.
print(tokenize_english(text))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/train_sst2_albert_base.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-10 17:41
import os

from hanlp.components.classifiers.transformer_classifier_tf import TransformerClassifierTF

from tests import cdroot

from hanlp.datasets.glu.glue import STANFORD_SENTIMENT_TREEBANK_2_DEV, STANFORD_SENTIMENT_TREEBANK_2_TRAIN, \
    STANFORD_SENTIMENT_TREEBANK_2_TEST

# presumably changes the working directory to the repository root so that the
# relative save_dir below resolves there — confirm against tests.cdroot
cdroot()
save_dir = os.path.join('data', 'model', 'sst', 'sst2_albert_base')
classifier = TransformerClassifierTF()
# Train on the SST-2 train split, using the dev split as the second dataset,
# and write the model to save_dir.
classifier.fit(STANFORD_SENTIMENT_TREEBANK_2_TRAIN, STANFORD_SENTIMENT_TREEBANK_2_DEV, save_dir,
               transformer='albert-base-v2')
# Reload the saved model, try it on one sentence, then evaluate on the test split.
classifier.load(save_dir)
print(classifier('it\' s a charming and often affecting journey'))
classifier.evaluate(STANFORD_SENTIMENT_TREEBANK_2_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/ja/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-17 22:30


================================================
FILE: plugins/hanlp_demo/hanlp_demo/ja/demo_mtl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-17 22:30
import hanlp
from hanlp_common.document import Document

# Load the pretrained multi-task model registered as
# NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA and run it on two raw Japanese
# sentences; the result is annotated as a Document.
HanLP = hanlp.load(hanlp.pretrained.mtl.NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA)
doc: Document = HanLP([
    '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
    '奈須きのこは1973年11月28日に千葉県円空山で生まれ、ゲーム制作会社「ノーツ」の設立者だ。',
])
print(doc)
doc.pretty_print()


================================================
FILE: plugins/hanlp_demo/hanlp_demo/mul/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 22:25


================================================
FILE: plugins/hanlp_demo/hanlp_demo/mul/demo_lid.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 16:49
import hanlp

# Load the pretrained language identifier registered as LID_176_FASTTEXT_BASE.
lid = hanlp.load(hanlp.pretrained.classifiers.LID_176_FASTTEXT_BASE)

# Plain call on a single text.
print(lid('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.'))
# With prob=True the result unpacks into a (language, probability) pair.
lang, prob = lid('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)
print(f'{lang} language identified with probability {prob:.3%}')
# presumably topk=2 returns the two most likely languages — confirm against the classifier API
print(lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2))

# For a combination of languages, predict top-k languages with probabilities:
text = '''
2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。
In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.
'''

print(lid(text, topk=3, prob=True))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/mul/demo_lid_restful.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 16:49
from hanlp_restful import HanLPClient

# Create a client for the public API endpoint with no auth key and the
# language set to 'mul'.
HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='mul')

# Identify the language of three texts in a single request.
print(HanLP.language_identification([
    'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.',
    '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
    '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',
]))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/mul/demo_mtl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 13:51
import hanlp
from hanlp_common.document import Document

# Load the pretrained multi-task model registered as
# UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE and run it on one
# English, one Japanese and one Chinese sentence in a single batch.
HanLP = hanlp.load(hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE)
doc: Document = HanLP([
    'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.',
    '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
    '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',
])
print(doc)
doc.pretty_print()


================================================
FILE: plugins/hanlp_demo/hanlp_demo/mul/train/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2023-02-21 19:40


================================================
FILE: plugins/hanlp_demo/hanlp_demo/mul/train/mul_base.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-03 14:24
from hanlp.common.dataset import SortingSamplerBuilder
from hanlp.common.transform import NormalizeToken
from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization
from hanlp.components.mtl.tasks.ud import UniversalDependenciesParsing
from hanlp.datasets.parsing.ptb import PTB_TOKEN_MAPPING
from hanlp.datasets.parsing.ud.ud210m import UD_210_MULTILINGUAL_TRAIN, UD_210_MULTILINGUAL_DEV, \
    UD_210_MULTILINGUAL_TEST
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding
from hanlp.utils.log_util import cprint
from tests import cdroot


def main():
    """Train, reload, evaluate and demo a multilingual multi-task model.

    Two tasks are trained on top of a shared transformer encoder:
    tokenization ('tok') and Universal Dependencies parsing ('ud').
    """
    cdroot()
    transformer = "nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large"
    tasks = {
        # Tokenization, trained from local TSV files.
        'tok': TaggingTokenization(
            'data/mtl/mul/tok/train.tsv',
            'data/mtl/mul/tok/dev.tsv',
            'data/mtl/mul/tok/test.tsv',
            SortingSamplerBuilder(batch_size=128, batch_max_tokens=12800),
            hard_constraint=True,
            tagging_scheme='BMES',
            delimiter='\t',
            max_seq_len=256,
            char_level=True,
            lr=1e-3,
        ),
        # UD parsing on the multilingual UD 2.10 splits; declared as depending on 'tok'.
        'ud': UniversalDependenciesParsing(
            UD_210_MULTILINGUAL_TRAIN,
            UD_210_MULTILINGUAL_DEV,
            UD_210_MULTILINGUAL_TEST,
            SortingSamplerBuilder(batch_size=128, batch_max_tokens=12800),
            lr=1e-3,
            dependencies='tok',
            max_seq_len=256,
        ),
    }
    mtl = MultiTaskLearning()
    save_dir = 'data/model/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mMiniLMv2L12'
    cprint(f'Model will be saved in [cyan]{save_dir}[/cyan]')
    mtl.fit(
        ContextualWordEmbedding(
            'token',
            transformer,
            average_subwords=True,
            max_sequence_length=512,
            word_dropout=.2,
        ),
        tasks,
        save_dir,
        30,  # presumably the number of epochs — confirm against MultiTaskLearning.fit
        lr=1e-3,
        encoder_lr=5e-5,
        grad_norm=1,
        gradient_accumulation=8,
        eval_trn=False,
        transform=NormalizeToken(PTB_TOKEN_MAPPING, 'token'),
        tau=0.5,
        cache='data/cache/ud/mtl',
    )
    cprint(f'Model saved in [cyan]{save_dir}[/cyan]')
    # Reload the trained model, adjust two settings and write the config back.
    mtl.load(save_dir)
    mtl['tok'].dict_force = {"'s", "n't", "'ll", "'m", "'d", "'ve", "'re"}
    mtl['ud'].config.tree = True
    mtl.save_config(save_dir)
    # Copy the dataset references from the task objects built above onto the
    # reloaded tasks before evaluating.
    for k, v in mtl.tasks.items():
        v.trn = tasks[k].trn
        v.dev = tasks[k].dev
        v.tst = tasks[k].tst
    mtl.evaluate(save_dir)
    # Smoke-test the model on one English, one Japanese and one Chinese sentence.
    doc = mtl(['In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.',
               '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
               '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'])
    doc.pretty_print()


if __name__ == '__main__':
    main()


================================================
FILE: plugins/hanlp_demo/hanlp_demo/sent_split.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 14:23
import hanlp

# Load the pretrained sentence splitter registered as UD_CTB_EOS_MUL.
split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)

# One input string mixing English, Chinese and Japanese sentences.
sample = '3.14 is pi. “你好！！！”——他说。劇場版「Fate/stay night [HF]」最終章公開カウントダウン！'
output = split_sent(sample)

# Emit one detected sentence per line.
joined = '\n'.join(output)
print(joined)
# See also https://hanlp.hankcs.com/docs/api/hanlp/components/eos.html


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 13:51


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/abstractive_summarization_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/abstractive_summarization_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fabstractive_summarization_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4M7ka0K5OMWU",
    "outputId": "d74f0749-0587-454a-d7c9-7418d45ce534"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "BMW528wGNulM"
   },
   "source": [
    "#### 申请秘钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 生成式自动摘要\n",
    "生成式自动摘要（Abstractive Summarization）任务的目标是为文章生成一段简短的概括性摘要。 生成的摘要有可能出现原文中不存在的新短语或新句子，并且整体流畅性较高。\n",
    "### 中文\n",
    "生成式自动摘要任务的输入为一段文本："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "936d439a-e1ff-4308-d2aa-775955558594"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'长江证券：看好大金属品种中的铜铝钢'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.abstractive_summarization('''\n",
    "每经AI快讯，2月4日，长江证券研究所金属行业首席分析师王鹤涛表示，2023年海外经济衰退，美债现处于历史高位，\n",
    "黄金的趋势是值得关注的；在国内需求修复的过程中，看好大金属品种中的铜铝钢。\n",
    "此外，在细分的小品种里，建议关注两条主线，一是新能源，比如锂、钴、镍、稀土，二是专精特新主线。（央视财经）\n",
    "''')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jj1Jk-2sPHYx"
   },
   "source": [
    "返回值为一段摘要。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 英文\n",
    "按照HanLP一贯的多语种设计，任何语言都支持。由于服务器GPU资源限制，目前英文接口暂未上线。如果你有相应需求，欢迎前往论坛发起请愿。"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "absum_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/amr_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/amr_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Famr_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4M7ka0K5OMWU",
    "outputId": "d74f0749-0587-454a-d7c9-7418d45ce534"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "BMW528wGNulM"
   },
   "source": [
    "#### 申请秘钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 抽象意义表示\n",
    "### 中文\n",
    "抽象意义表示任务的输入为一段文本或已分词完毕的句子："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "936d439a-e1ff-4308-d2aa-775955558594"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "graphs = HanLP.abstract_meaning_representation('男孩希望女孩相信他。')\n",
    "len(graphs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jj1Jk-2sPHYx"
   },
   "source": [
    "返回值为每个句子相应的AMR图的Meaning Representation格式："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': '0',\n",
       " 'input': '男孩 希望 女孩 相信 他 。',\n",
       " 'nodes': [{'id': 0,\n",
       "   'label': '男孩',\n",
       "   'anchors': [{'from': 0, 'to': 2}, {'from': 12, 'to': 13}]},\n",
       "  {'id': 1, 'label': '希望-01', 'anchors': [{'from': 3, 'to': 5}]},\n",
       "  {'id': 2, 'label': '女孩', 'anchors': [{'from': 6, 'to': 8}]},\n",
       "  {'id': 3, 'label': '相信-01', 'anchors': [{'from': 9, 'to': 11}]}],\n",
       " 'edges': [{'source': 1, 'target': 3, 'label': 'arg1'},\n",
       "  {'source': 1, 'target': 0, 'label': 'arg0'},\n",
       "  {'source': 3, 'target': 2, 'label': 'arg0'},\n",
       "  {'source': 3, 'target': 0, 'label': 'arg1'}],\n",
       " 'tops': [1],\n",
       " 'framework': 'amr'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "graph = graphs[0]\n",
    "graph"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "注意上面“男孩”有2个anchor，分别对应“男孩”和“他”。也就是说，MR格式其实包含了指代消解的结果。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 可视化\n",
    "指定`visualization='svg'`即可得到矢量图可视化。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" width=\"171pt\" height=\"298pt\" viewBox=\"0.00 0.00 170.78 297.55\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 293.55)\">\n",
       "<title>0</title>\n",
       "<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-293.55 166.78,-293.55 166.78,4 -4,4\"/>\n",
       "<!-- top -->\n",
       "<!-- 1 -->\n",
       "<g id=\"node2\" class=\"node\">\n",
       "<title>1</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" cx=\"59.53\" cy=\"-197.46\" rx=\"45.01\" ry=\"19.18\"/>\n",
       "<text text-anchor=\"start\" x=\"37.53\" y=\"-193.26\" font-family=\"Times,serif\" font-size=\"14.00\">希望-01</text>\n",
       "</g>\n",
       "<!-- top&#45;&gt;1 -->\n",
       "<g id=\"edge1\" class=\"edge\">\n",
       "<title>top-&gt;1</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M59.53,-253.47C59.53,-245.51 59.53,-235.82 59.53,-226.81\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"63.03,-226.68 59.53,-216.68 56.03,-226.68 63.03,-226.68\"/>\n",
       "</g>\n",
       "<!-- 0 -->\n",
       "<g id=\"node3\" class=\"node\">\n",
       "<title>0</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" cx=\"32.53\" cy=\"-19.09\" rx=\"32.55\" ry=\"19.18\"/>\n",
       "<text text-anchor=\"start\" x=\"19.53\" y=\"-14.89\" font-family=\"Times,serif\" font-size=\"14.00\">男孩</text>\n",
       "</g>\n",
       "<!-- 1&#45;&gt;0 -->\n",
       "<g id=\"edge2\" class=\"edge\">\n",
       "<title>1-&gt;0</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M52.8,-178.56C47.93,-164.78 41.68,-145.14 38.53,-127.37 33.85,-100.97 32.53,-70.4 32.28,-48.67\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"35.78,-48.45 32.23,-38.47 28.78,-48.48 35.78,-48.45\"/>\n",
       "<text text-anchor=\"middle\" x=\"51.03\" y=\"-104.58\" font-family=\"Times,serif\" font-size=\"14.00\">arg0</text>\n",
       "</g>\n",
       "<!-- 3 -->\n",
       "<g id=\"node4\" class=\"node\">\n",
       "<title>3</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" cx=\"117.53\" cy=\"-108.28\" rx=\"45.01\" ry=\"19.18\"/>\n",
       "<text text-anchor=\"start\" x=\"95.53\" y=\"-104.08\" font-family=\"Times,serif\" font-size=\"14.00\">相信-01</text>\n",
       "</g>\n",
       "<!-- 1&#45;&gt;3 -->\n",
       "<g id=\"edge3\" class=\"edge\">\n",
       "<title>1-&gt;3</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M71.26,-178.82C79.52,-166.41 90.71,-149.59 100.01,-135.6\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"103.18,-137.16 105.81,-126.89 97.35,-133.28 103.18,-137.16\"/>\n",
       "<text text-anchor=\"middle\" x=\"104.03\" y=\"-149.17\" font-family=\"Times,serif\" font-size=\"14.00\">arg1</text>\n",
       "</g>\n",
       "<!-- 3&#45;&gt;0 -->\n",
       "<g id=\"edge4\" class=\"edge\">\n",
       "<title>3-&gt;0</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M101.14,-90.47C88.07,-77.07 69.64,-58.15 55.16,-43.31\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"57.49,-40.69 48.01,-35.97 52.48,-45.57 57.49,-40.69\"/>\n",
       "<text text-anchor=\"middle\" x=\"92.03\" y=\"-59.98\" font-family=\"Times,serif\" font-size=\"14.00\">arg1</text>\n",
       "</g>\n",
       "<!-- 2 -->\n",
       "<g id=\"node5\" class=\"node\">\n",
       "<title>2</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" cx=\"117.53\" cy=\"-19.09\" rx=\"32.55\" ry=\"19.18\"/>\n",
       "<text text-anchor=\"start\" x=\"104.53\" y=\"-14.89\" font-family=\"Times,serif\" font-size=\"14.00\">女孩</text>\n",
       "</g>\n",
       "<!-- 3&#45;&gt;2 -->\n",
       "<g id=\"edge5\" class=\"edge\">\n",
       "<title>3-&gt;2</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M117.53,-88.79C117.53,-77.03 117.53,-61.59 117.53,-48.32\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"121.03,-48.26 117.53,-38.26 114.03,-48.26 121.03,-48.26\"/>\n",
       "<text text-anchor=\"middle\" x=\"130.03\" y=\"-59.98\" font-family=\"Times,serif\" font-size=\"14.00\">arg0</text>\n",
       "</g>\n",
       "</g>\n",
       "</svg>"
      ],
      "text/plain": [
       "<IPython.core.display.SVG object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from IPython.display import SVG, display\n",
    "\n",
    "def show_svg(g):\n",
    "    display(SVG(data=g['svg']))\n",
    "    \n",
    "graph = HanLP.abstract_meaning_representation('男孩希望女孩相信他。', visualization='svg')[0]\n",
    "show_svg(graph)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 多语种支持\n",
    "除了中文外，还支持以下语言："
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 英文\n",
    "目前，HanLP服务器还支持英文AMR："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" width=\"192pt\" height=\"298pt\" viewBox=\"0.00 0.00 191.82 297.55\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 293.55)\">\n",
       "<title>0</title>\n",
       "<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-293.55 187.82,-293.55 187.82,4 -4,4\"/>\n",
       "<!-- top -->\n",
       "<!-- 1 -->\n",
       "<g id=\"node2\" class=\"node\">\n",
       "<title>1</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" cx=\"46.67\" cy=\"-197.46\" rx=\"46.84\" ry=\"19.18\"/>\n",
       "<text text-anchor=\"start\" x=\"24.17\" y=\"-193.26\" font-family=\"Times,serif\" font-size=\"14.00\">want-01</text>\n",
       "</g>\n",
       "<!-- top&#45;&gt;1 -->\n",
       "<g id=\"edge1\" class=\"edge\">\n",
       "<title>top-&gt;1</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M46.67,-253.47C46.67,-245.51 46.67,-235.82 46.67,-226.81\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"50.17,-226.68 46.67,-216.68 43.17,-226.68 50.17,-226.68\"/>\n",
       "</g>\n",
       "<!-- 0 -->\n",
       "<g id=\"node3\" class=\"node\">\n",
       "<title>0</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" cx=\"41.67\" cy=\"-19.09\" rx=\"29.9\" ry=\"19.18\"/>\n",
       "<text text-anchor=\"start\" x=\"31.17\" y=\"-14.89\" font-family=\"Times,serif\" font-size=\"14.00\">boy</text>\n",
       "</g>\n",
       "<!-- 1&#45;&gt;0 -->\n",
       "<g id=\"edge2\" class=\"edge\">\n",
       "<title>1-&gt;0</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M44.32,-178.31C42.65,-164.4 40.55,-144.71 39.67,-127.37 38.32,-100.63 39.11,-70.1 40.07,-48.47\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"43.58,-48.48 40.56,-38.33 36.58,-48.15 43.58,-48.48\"/>\n",
       "<text text-anchor=\"middle\" x=\"52.17\" y=\"-104.58\" font-family=\"Times,serif\" font-size=\"14.00\">arg0</text>\n",
       "</g>\n",
       "<!-- 3 -->\n",
       "<g id=\"node4\" class=\"node\">\n",
       "<title>3</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" cx=\"128.67\" cy=\"-108.28\" rx=\"55.31\" ry=\"19.18\"/>\n",
       "<text text-anchor=\"start\" x=\"99.67\" y=\"-104.08\" font-family=\"Times,serif\" font-size=\"14.00\">believe-01</text>\n",
       "</g>\n",
       "<!-- 1&#45;&gt;3 -->\n",
       "<g id=\"edge3\" class=\"edge\">\n",
       "<title>1-&gt;3</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M62.87,-179.23C74.92,-166.42 91.55,-148.74 105.08,-134.36\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"107.92,-136.45 112.22,-126.76 102.82,-131.65 107.92,-136.45\"/>\n",
       "<text text-anchor=\"middle\" x=\"105.17\" y=\"-149.17\" font-family=\"Times,serif\" font-size=\"14.00\">arg1</text>\n",
       "</g>\n",
       "<!-- 3&#45;&gt;0 -->\n",
       "<g id=\"edge4\" class=\"edge\">\n",
       "<title>3-&gt;0</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M111.48,-90.05C97.99,-76.53 79.08,-57.58 64.35,-42.82\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"66.61,-40.13 57.07,-35.53 61.66,-45.08 66.61,-40.13\"/>\n",
       "<text text-anchor=\"middle\" x=\"103.17\" y=\"-59.98\" font-family=\"Times,serif\" font-size=\"14.00\">arg1</text>\n",
       "</g>\n",
       "<!-- 2 -->\n",
       "<g id=\"node5\" class=\"node\">\n",
       "<title>2</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" cx=\"128.67\" cy=\"-19.09\" rx=\"28.07\" ry=\"19.18\"/>\n",
       "<text text-anchor=\"start\" x=\"119.17\" y=\"-14.89\" font-family=\"Times,serif\" font-size=\"14.00\">girl</text>\n",
       "</g>\n",
       "<!-- 3&#45;&gt;2 -->\n",
       "<g id=\"edge5\" class=\"edge\">\n",
       "<title>3-&gt;2</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M128.67,-88.79C128.67,-77.03 128.67,-61.59 128.67,-48.32\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"132.17,-48.26 128.67,-38.26 125.17,-48.26 132.17,-48.26\"/>\n",
       "<text text-anchor=\"middle\" x=\"141.17\" y=\"-59.98\" font-family=\"Times,serif\" font-size=\"14.00\">arg0</text>\n",
       "</g>\n",
       "</g>\n",
       "</svg>"
      ],
      "text/plain": [
       "<IPython.core.display.SVG object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "graph = HanLP.abstract_meaning_representation('The boy wants the girl to believe him.',\n",
    "                                      language='en', visualization='svg')[0]\n",
    "show_svg(graph)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "用户可以通过指定`language`参数来实现英文抽象意义表示的分析，也可以通过`tokens`参数直接传入已分词完毕的句子："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': '0',\n",
       " 'input': 'The boy wants the girl to believe him .',\n",
       " 'nodes': [{'id': 0, 'label': 'boy'},\n",
       "  {'id': 1, 'label': 'wants-01'},\n",
       "  {'id': 2, 'label': 'girl'},\n",
       "  {'id': 3, 'label': 'believe-01'}],\n",
       " 'edges': [{'source': 3, 'target': 0, 'label': 'arg1'},\n",
       "  {'source': 1, 'target': 3, 'label': 'arg1'},\n",
       "  {'source': 3, 'target': 2, 'label': 'arg0'},\n",
       "  {'source': 1, 'target': 0, 'label': 'arg0'}],\n",
       " 'tops': [1],\n",
       " 'framework': 'amr'}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.abstract_meaning_representation(tokens=[['The', 'boy', 'wants', 'the', 'girl', 'to', 'believe', 'him', '.']], \n",
    "                                            language='en')[0]"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "amr_stl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/amr_stl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/amr_stl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Famr_stl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp[amr] -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4M7ka0K5OMWU",
    "outputId": "d74f0749-0587-454a-d7c9-7418d45ce534"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'AMR3_SEQ2SEQ_BART_LARGE': 'https://file.hankcs.com/hanlp/amr/amr3_seq2seq_bart_large_83.30_20220125_114450.zip',\n",
       " 'MRP2020_AMR_ENG_ZHO_XLM_BASE': 'http://download.hanlp.com/amr/extra/amr-eng-zho-xlm-roberta-base_20220412_223756.zip',\n",
       " 'MRP2020_AMR_ZHO_MENGZI_BASE': 'http://download.hanlp.com/amr/extra/amr-zho-mengzi-base_20220415_101941.zip'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.amr.ALL # 语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "BMW528wGNulM"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "0tmKBu7sNAXX",
    "outputId": "df2de87b-27f5-4c72-8eb2-25ceefdd8270"
   },
   "outputs": [],
   "source": [
    "amr = hanlp.load('MRP2020_AMR_ENG_ZHO_XLM_BASE')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 抽象意义表示\n",
    "抽象意义表示任务的输入为一个或多个句子，`MRP2020_AMR_ENG_ZHO_XLM_BASE`要求提供分词完毕的句子："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "936d439a-e1ff-4308-d2aa-775955558594"
   },
   "outputs": [],
   "source": [
    "graph = amr([\"男孩\", \"希望\", \"女孩\", \"相信\", \"他\", \"。\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jj1Jk-2sPHYx"
   },
   "source": [
    "返回对象为[penman.Graph](https://penman.readthedocs.io/en/latest/api/penman.graph.html)类型："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AMRGraph object (top=x2) at 12603529872>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "graph"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "打印时为友好格式："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(x2 / 希望-01\n",
      "    :arg1 (x4 / 相信-01\n",
      "              :arg0 (x3 / 女孩)\n",
      "              :arg1 x1)\n",
      "    :arg0 (x1 / 男孩))\n"
     ]
    }
   ],
   "source": [
    "print(graph)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "该AMR的可视化结果为：\n",
    "\n",
    "![amr-zh](https://hanlp.hankcs.com/backend/v2/amr_svg?tokens=%E7%94%B7%E5%AD%A9%20%E5%B8%8C%E6%9C%9B%20%E5%A5%B3%E5%AD%A9%20%E7%9B%B8%E4%BF%A1%20%E4%BB%96%20%E3%80%82&language=zh&scale=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`MRP2020_AMR_ENG_ZHO_XLM_BASE`其实是一个Meaning Representation Parsing模型，支持输出Meaning Representation（MR）格式，该格式比AMR的表达力更强："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': '0',\n",
       " 'input': '男孩 希望 女孩 相信 他 。',\n",
       " 'nodes': [{'id': 0,\n",
       "   'label': '男孩',\n",
       "   'anchors': [{'from': 0, 'to': 2}, {'from': 12, 'to': 13}]},\n",
       "  {'id': 1, 'label': '希望-01', 'anchors': [{'from': 3, 'to': 5}]},\n",
       "  {'id': 2, 'label': '女孩', 'anchors': [{'from': 6, 'to': 8}]},\n",
       "  {'id': 3, 'label': '相信-01', 'anchors': [{'from': 9, 'to': 11}]}],\n",
       " 'edges': [{'source': 1, 'target': 3, 'label': 'arg1'},\n",
       "  {'source': 1, 'target': 0, 'label': 'arg0'},\n",
       "  {'source': 3, 'target': 2, 'label': 'arg0'},\n",
       "  {'source': 3, 'target': 0, 'label': 'arg1'}],\n",
       " 'tops': [1],\n",
       " 'framework': 'amr'}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "amr([\"男孩\", \"希望\", \"女孩\", \"相信\", \"他\", \"。\"], output_amr=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "注意上面“男孩”有2个anchor，分别对应“男孩”和“他”。也就是说，MR格式其实包含了指代消解的结果。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 多语种支持\n",
    "`MRP2020_AMR_ENG_ZHO_XLM_BASE`同时还是一个Cross-Lingual模型，支持的语言列表："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['amr', 'eng'], ['amr', 'zho']]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "amr.config.frameworks"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "用户可以通过指定`language`参数来实现英文抽象意义表示的分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(w1 / wants-01\n",
      "    :arg1 (b2 / believe-01\n",
      "              :arg0 (g1 / girl)\n",
      "              :arg1 b1)\n",
      "    :arg0 (b1 / boy))\n"
     ]
    }
   ],
   "source": [
    "print(amr(['The', 'boy', 'wants', 'the', 'girl', 'to', 'believe', 'him', '.'], language='eng'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "为了达到最佳效果，建议同时提供每个词的词元（lemma）："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(w1 / want-01\n",
      "    :arg1 (b2 / believe-01\n",
      "              :arg0 (g1 / girl)\n",
      "              :arg1 b1)\n",
      "    :arg0 (b1 / boy))\n"
     ]
    }
   ],
   "source": [
    "print(amr([('The', 'the'), ('boy', 'boy'), ('wants', 'want'), ('the', 'the'), ('girl', 'girl'), ('to', 'to'),\n",
    "              ('believe', 'believe'), ('him', 'he'), ('.', '.')], language='eng'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "该AMR的可视化结果为：\n",
    "\n",
    "![amr-en](https://hanlp.hankcs.com/backend/v2/amr_svg?tokens=The%20boy%20wants%20the%20girl%20to%20believe%20him%20.&language=en&scale=1)"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "amr_stl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/classification_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/classification_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fclassification_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "nf9TgeCTC0OT"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jaW4eu6kC0OU",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_xI_bLAaC0OU"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "IYwV-UkNNzFp",
    "outputId": "54065443-9b0a-444c-f6c0-c701bc86400b",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1Uf_u7ddMhUt",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "#### 申请秘钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 文本分类\n",
    "文本分类任务的输入为文档以及分类模型，以新闻领域的`news_zh`为例："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "id": "BqEmDMGGOtk3"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'科技'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.text_classification('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', model='news_zh')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "SwaPn1hjC0OW"
   },
   "source": [
    "返回值为文档最可能的类目。HanLP支持返回类目对应的概率（置信度）："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "egpWwHKxC0OX",
    "outputId": "f7c77687-dd75-4fa2-dbd2-be6bda8a3fff"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['科技', 0.999642014503479]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.text_classification('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', model='news_zh', prob=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "kq_j5TLFC0OX"
   },
   "source": [
    "HanLP也支持返回概率最高的`topk`个类目："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "isJhzYyIC0OX",
    "outputId": "683c8489-dffc-426e-f95b-e91dfb373260"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['科技', '家居']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.text_classification('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', model='news_zh', topk=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "该功能对于混合了多个主题的文档而言特别实用："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'时尚': 0.6342714428901672,\n",
       " '家居': 0.359315425157547,\n",
       " '科技': 0.0013340614968910813,\n",
       " '体育': 0.001275017624720931,\n",
       " '房产': 0.0010209722677245736,\n",
       " '娱乐': 0.0006360886618494987,\n",
       " '财经': 0.0005668793455697596,\n",
       " '游戏': 0.00037119409535080194,\n",
       " '教育': 0.00029694309341721237,\n",
       " '股票': 0.0002858955995179713,\n",
       " '星座': 0.0002288677787873894,\n",
       " '彩票': 0.00022682634880766273,\n",
       " '时政': 0.0001005345256999135,\n",
       " '社会': 6.985480285948142e-05}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = '''\n",
    "改了好几次，感觉终于可以确定了。\n",
    "这次的真丝是做了古董感的米金色染色，法蕾也做了同样的颜色。\n",
    "真丝软糯的手感和温柔的光泽感，在即将结束的冬天，显得格外的美好。\n",
    "'''\n",
    "\n",
    "HanLP.text_classification(text, model='news_zh', topk=True, prob=True)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "classification_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/con_mtl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/con_mtl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fcon_mtl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n",
       " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n",
       " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.mtl.ALL # MTL多任务，具体任务见模型名称，语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存。自然语言处理分为许多任务，分词只是最初级的一个。与其每个任务单独创建一个模型，不如利用HanLP的联合模型一次性完成多个任务："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 短语句法分析\n",
    "任务越少，速度越快。如指定仅执行短语句法分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [],
   "source": [
    "doc = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'], tasks='con')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok/fine\": [\n",
      "    [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n",
      "    [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n",
      "  ],\n",
      "  \"con\": [\n",
      "    [\"TOP\", [[\"IP\", [[\"NP\", [[\"_\", [\"2021年\"]]]], [\"NP\", [[\"_\", [\"HanLPv2.1\"]]]], [\"VP\", [[\"PP\", [[\"_\", [\"为\"]], [\"NP\", [[\"_\", [\"生产\"]], [\"_\", [\"环境\"]]]]]], [\"VP\", [[\"_\", [\"带来\"]], [\"NP\", [[\"ADJP\", [[\"NP\", [[\"ADJP\", [[\"_\", [\"次\"]]]], [\"NP\", [[\"_\", [\"世代\"]]]]]], [\"ADVP\", [[\"_\", [\"最\"]]]], [\"VP\", [[\"_\", [\"先进\"]]]]]], [\"_\", [\"的\"]], [\"NP\", [[\"QP\", [[\"_\", [\"多\"]]]], [\"NP\", [[\"_\", [\"语种\"]]]]]], [\"NP\", [[\"_\", [\"NLP\"]], [\"_\", [\"技术\"]]]]]]]]]], [\"_\", [\"。\"]]]]]],\n",
      "    [\"TOP\", [[\"IP\", [[\"NP\", [[\"_\", [\"阿婆主\"]]]], [\"VP\", [[\"VP\", [[\"_\", [\"来到\"]], [\"NP\", [[\"_\", [\"北京\"]], [\"_\", [\"立方庭\"]]]]]], [\"VP\", [[\"_\", [\"参观\"]], [\"NP\", [[\"_\", [\"自然\"]], [\"_\", [\"语义\"]], [\"_\", [\"科技\"]], [\"_\", [\"公司\"]]]]]]]], [\"_\", [\"。\"]]]]]]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`doc['con']`是由Tree组成的列表（每个句子对应一棵树），Tree是list的子类。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wxctCigrTKu-"
   },
   "source": [
    "可视化短语句法树："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Zo08uquCTFSk",
    "outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Token&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">P&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;9&nbsp;<br>───────────────────────────────────────────────────────<br>_───────────────────────────────────────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>_───────────────────────────────────────────►NP────┤&nbsp;&nbsp;&nbsp;<br>_──────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├────────────────────────►PP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┴►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───►ADJP──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP────┤&nbsp;&nbsp;&nbsp;<br>_───►NP&nbsp;───┴►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───────────►ADVP──┼►ADJP──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br>_───────────►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp
;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───►QP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───►NP&nbsp;───┴────────►NP────┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┴────────────────►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div><br><div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>阿婆主&nbsp;<br>来到&nbsp;&nbsp;<br>北京&nbsp;&nbsp;<br>立方庭&nbsp;<br>参观&nbsp;&nbsp;<br>自然&nbsp;&nbsp;<br>语义&nbsp;&nbsp;<br>科技&nbsp;&nbsp;<br>公司&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">P&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;<br>───────────────────────────────<br>_───────────────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>_──────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┴►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP────┤&nbsp;&nbsp;&nbsp;<br>_──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br>_&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_&nbsp;&nbsp;├►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "doc.pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "将第一个短语树转换为bracketed格式："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(TOP\n",
      "  (IP\n",
      "    (NP (_ 2021年))\n",
      "    (NP (_ HanLPv2.1))\n",
      "    (VP\n",
      "      (PP (_ 为) (NP (_ 生产) (_ 环境)))\n",
      "      (VP\n",
      "        (_ 带来)\n",
      "        (NP\n",
      "          (ADJP\n",
      "            (NP (ADJP (_ 次)) (NP (_ 世代)))\n",
      "            (ADVP (_ 最))\n",
      "            (VP (_ 先进)))\n",
      "          (_ 的)\n",
      "          (NP (QP (_ 多)) (NP (_ 语种)))\n",
      "          (NP (_ NLP) (_ 技术)))))\n",
      "    (_ 。)))\n"
     ]
    }
   ],
   "source": [
    "print(doc['con'][0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "将第一个短语树转换为list格式："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['TOP',\n",
       " [['IP',\n",
       "   [['NP', [['_', ['2021年']]]],\n",
       "    ['NP', [['_', ['HanLPv2.1']]]],\n",
       "    ['VP',\n",
       "     [['PP', [['_', ['为']], ['NP', [['_', ['生产']], ['_', ['环境']]]]]],\n",
       "      ['VP',\n",
       "       [['_', ['带来']],\n",
       "        ['NP',\n",
       "         [['ADJP',\n",
       "           [['NP', [['ADJP', [['_', ['次']]]], ['NP', [['_', ['世代']]]]]],\n",
       "            ['ADVP', [['_', ['最']]]],\n",
       "            ['VP', [['_', ['先进']]]]]],\n",
       "          ['_', ['的']],\n",
       "          ['NP', [['QP', [['_', ['多']]]], ['NP', [['_', ['语种']]]]]],\n",
       "          ['NP', [['_', ['NLP']], ['_', ['技术']]]]]]]]]],\n",
       "    ['_', ['。']]]]]]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doc['con'][0].to_list()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XOsWkOqQfzlr"
   },
   "source": [
    "为已分词的句子执行短语句法分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "bLZSTbv_f3OA",
    "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Token&nbsp;<br>─────&nbsp;<br>hanlp&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;<br>次世代&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多语种&nbsp;&nbsp;&nbsp;<br>nlp&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">P&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;9&nbsp;<br>───────────────────────────────────────────────────────<br>_───────────────────────────────────────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>_──────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├────────────────────────►PP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┴►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbs
p;│&nbsp;&nbsp;&nbsp;<br>_───►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP────┤&nbsp;&nbsp;&nbsp;<br>_───►ADVP──┼►VP&nbsp;────►IP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br>_───►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───────────────────►NP────┼►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───────────────────►NP────┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───────────────────►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div><br><div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>我&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;<br>希望&nbsp;&nbsp;<br>是&nbsp;&nbsp;&nbsp;<br>希望&nbsp;&nbsp;<br>张晚霞&nbsp;<br>的&nbsp;&nbsp;&nbsp;<br>背影&nbsp;&nbsp;<br>被&nbsp;&nbsp;&nbsp;<br>晚霞&nbsp;&nbsp;<br>映红&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: 
SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">P&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;10&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;11<br>───────────────────────────────────────────────────────────────────────<br>_───►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>_──────────┴►DNP&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>_───────────►NP&nbsp;───┴────────────────────────────────────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP────┤&nbsp;&nbsp;&nbsp;
<br>_──────────┴►DNP&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;────►IP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───────────►NP&nbsp;───┴────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br>_──────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───►VP&nbsp;───┴►IP&nbsp;────►CP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────────────────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP([\n",
    "    [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
    "    [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
    "  ], tasks='con', skip_tasks='tok*').pretty_print()"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "con_mtl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/con_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/con_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fcon_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "#### 申请密钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 短语句法分析\n",
    "任务越少，速度越快。如指定仅执行短语句法分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [],
   "source": [
    "doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', tasks='con')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok/fine\": [\n",
      "    [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"]\n",
      "  ],\n",
      "  \"con\": [\n",
      "    [\"TOP\", [[\"IP\", [[\"NP\", [[\"_\", [\"2021年\"]]]], [\"NP\", [[\"_\", [\"HanLPv2.1\"]]]], [\"VP\", [[\"PP\", [[\"_\", [\"为\"]], [\"NP\", [[\"_\", [\"生产\"]], [\"_\", [\"环境\"]]]]]], [\"VP\", [[\"_\", [\"带来\"]], [\"NP\", [[\"IP\", [[\"VP\", [[\"NP\", [[\"QP\", [[\"CLP\", [[\"_\", [\"次\"]]]]]], [\"NP\", [[\"_\", [\"世代\"]]]]]], [\"ADVP\", [[\"_\", [\"最\"]]]], [\"VP\", [[\"_\", [\"先进\"]]]]]]]], [\"_\", [\"的\"]], [\"NP\", [[\"QP\", [[\"_\", [\"多\"]]]], [\"NP\", [[\"_\", [\"语种\"]]]]]], [\"NP\", [[\"_\", [\"NLP\"]], [\"_\", [\"技术\"]]]]]]]]]], [\"_\", [\"。\"]]]]]]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`doc['con']`是由Tree组成的列表（每个句子对应一棵树），Tree是list的子类。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wxctCigrTKu-"
   },
   "source": [
    "可视化短语句法树："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Zo08uquCTFSk",
    "outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Token&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">P&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;10&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;11<br>───────────────────────────────────────────────────────────────────────<br>_───────────────────────────────────────────────────────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>_───────────────────────────────────────────────────────────►NP────┤&nbsp;&nbsp;&nbsp;<br>_──────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├────────────────────────────────────────►PP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┴►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───►CLP&nbsp;───►QP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP────┤&nbsp;&nbsp;&nbsp;<br>_───────────►NP
&nbsp;───┴►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───────────────────►ADVP──┼►VP&nbsp;────►IP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br>_───────────────────►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────────────┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───►QP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───►NP&nbsp;───┴────────────────────────►NP────┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┴────────────────────────────────►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp
;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────────────────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "doc.pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "转换为bracketed格式："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(TOP\n",
      "  (IP\n",
      "    (NP (_ 2021年))\n",
      "    (NP (_ HanLPv2.1))\n",
      "    (VP\n",
      "      (PP (_ 为) (NP (_ 生产) (_ 环境)))\n",
      "      (VP\n",
      "        (_ 带来)\n",
      "        (NP\n",
      "          (IP\n",
      "            (VP\n",
      "              (NP (QP (CLP (_ 次))) (NP (_ 世代)))\n",
      "              (ADVP (_ 最))\n",
      "              (VP (_ 先进))))\n",
      "          (_ 的)\n",
      "          (NP (QP (_ 多)) (NP (_ 语种)))\n",
      "          (NP (_ NLP) (_ 技术)))))\n",
      "    (_ 。)))\n"
     ]
    }
   ],
   "source": [
    "print(doc['con'][0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XOsWkOqQfzlr"
   },
   "source": [
    "为已分词的句子执行短语句法分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "bLZSTbv_f3OA",
    "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Token&nbsp;<br>─────&nbsp;<br>hanlp&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;<br>次世代&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多语种&nbsp;&nbsp;&nbsp;<br>nlp&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">P&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;10&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;11&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;12<br>───────────────────────────────────────────────────────────────────────────────<br>_───────────────────────────────────────────────────────────────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>_──────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├────────────────────────────────────────────────►PP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──┴►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp
;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP────┤&nbsp;&nbsp;&nbsp;<br>_───►ADVP──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;────►IP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br>_───►VP&nbsp;───┴►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►CP&nbsp;────►CP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────────────────────┼►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&
nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───►NP&nbsp;───┴────────────────────────────────►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────────────────────────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div><br><div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>我&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;<br>希望&nbsp;&nbsp;<br>是&nbsp;&nbsp;&nbsp;<br>希望&nbsp;&nbsp;<br>张晚霞&nbsp;<br>的&nbsp;&nbsp;&nbsp;<br>背影&nbsp;&nbsp;<br>被&nbsp;&nbsp;&nbsp;<br>晚霞&nbsp;&nbsp;<br>映红&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">P&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;10&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;11<br>───────────────────────────────────────────────────────────────────────<br>_───►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>_──────────┴►DNP&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>_───────────►NP&nbsp;───┴────────────────────────────────────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP────┤&nbsp;&nbsp;&nbsp;<br>_──────────┴►DNP&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;────►IP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───────────►NP&nbsp;───┴────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br>_──────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_───►VP&nbsp;───┴►IP&nbsp;────►CP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>_──────────────────────────────────────────────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP(tokens=[\n",
    "    [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
    "    [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
    "  ], tasks='con').pretty_print()"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "con_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/con_stl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/con_stl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fcon_stl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CTB9_CON_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/constituency/ctb9_con_electra_small_20220215_230116.zip',\n",
       " 'CTB9_CON_FULL_TAG_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/constituency/ctb9_full_tag_con_electra_small_20220118_103119.zip'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.constituency.ALL # 语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "con = hanlp.load('CTB9_CON_FULL_TAG_ELECTRA_SMALL')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 短语句法分析\n",
    "输入为已分词的一个或多个句子："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [],
   "source": [
    "trees = con([[\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"], [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]], tasks='con')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回值为一个`Tree`的数组："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['TOP', [['IP', [['NP-TMP', [['_', ['2021年']]]], ['NP-PN-SBJ', [['_', ['HanLPv2.1']]]], ['VP', [['PP-BNF', [['_', ['为']], ['NP', [['_', ['生产']], ['_', ['环境']]]]]], ['VP', [['_', ['带来']], ['NP-OBJ', [['CP', [['CP', [['IP', [['VP', [['NP', [['DP', [['_', ['次']]]], ['NP', [['_', ['世代']]]]]], ['ADVP', [['_', ['最']]]], ['VP', [['_', ['先进']]]]]]]], ['_', ['的']]]]]], ['NP', [['QP', [['_', ['多']]]], ['NP', [['_', ['语种']]]]]], ['NP', [['_', ['NLP']], ['_', ['技术']]]]]]]]]], ['_', ['。']]]]]], ['TOP', [['IP', [['NP-SBJ', [['_', ['阿婆主']]]], ['VP', [['VP', [['_', ['来到']], ['NP-OBJ', [['_', ['北京']], ['NP-PN', [['_', ['立方庭']]]]]]]], ['VP', [['_', ['参观']], ['NP-OBJ', [['_', ['自然']], ['_', ['语义']], ['_', ['科技']], ['_', ['公司']]]]]]]], ['_', ['。']]]]]]]\n"
     ]
    }
   ],
   "source": [
    "print(trees)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "转换为bracketed格式："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(TOP\n",
      "  (IP\n",
      "    (NP-TMP (_ 2021年))\n",
      "    (NP-PN-SBJ (_ HanLPv2.1))\n",
      "    (VP\n",
      "      (PP-BNF (_ 为) (NP (_ 生产) (_ 环境)))\n",
      "      (VP\n",
      "        (_ 带来)\n",
      "        (NP-OBJ\n",
      "          (CP\n",
      "            (CP\n",
      "              (IP\n",
      "                (VP\n",
      "                  (NP (DP (_ 次)) (NP (_ 世代)))\n",
      "                  (ADVP (_ 最))\n",
      "                  (VP (_ 先进))))\n",
      "              (_ 的)))\n",
      "          (NP (QP (_ 多)) (NP (_ 语种)))\n",
      "          (NP (_ NLP) (_ 技术)))))\n",
      "    (_ 。)))\n"
     ]
    }
   ],
   "source": [
    "print(trees[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 组装流水线"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "短语成分树的第一层non-terminal一般是词性标签，所以经常与词性标注一起使用。为此，先加载一个词性标注器："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "然后创建一个函数将词性标签和句法树组装起来："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from hanlp_common.document import Document\n",
    "def merge_pos_into_con(doc:Document):\n",
    "    flat = isinstance(doc['pos'][0], str)\n",
    "    if flat:\n",
    "        doc = Document((k, [v]) for k, v in doc.items())\n",
    "    for tree, tags in zip(doc['con'], doc['pos']):\n",
    "        offset = 0\n",
    "        for subtree in tree.subtrees(lambda t: t.height() == 2):\n",
    "            tag = subtree.label()\n",
    "            if tag == '_':\n",
    "                subtree.set_label(tags[offset])\n",
    "            offset += 1\n",
    "    if flat:\n",
    "        doc = doc.squeeze()\n",
    "    return doc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "之后就可以用一个流水线将三者组装起来了："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp = hanlp.pipeline() \\\n",
    "    .append(pos, input_key='tok', output_key='pos') \\\n",
    "    .append(con, input_key='tok', output_key='con') \\\n",
    "    .append(merge_pos_into_con, input_key='*')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "该流水线的结构如下："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[tok->TransformerTagger->pos, tok->CRFConstituencyParser->con, None->merge_pos_into_con->None]\n"
     ]
    }
   ],
   "source": [
    "print(nlp)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "传入一个已分词的句子试试："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok\": [\n",
      "    \"2021年\",\n",
      "    \"HanLPv2.1\",\n",
      "    \"带来\",\n",
      "    \"最\",\n",
      "    \"先进\",\n",
      "    \"的\",\n",
      "    \"多\",\n",
      "    \"语种\",\n",
      "    \"NLP\",\n",
      "    \"技术\",\n",
      "    \"。\"\n",
      "  ],\n",
      "  \"pos\": [\n",
      "    \"NT\",\n",
      "    \"NR\",\n",
      "    \"VV\",\n",
      "    \"AD\",\n",
      "    \"VA\",\n",
      "    \"DEC\",\n",
      "    \"CD\",\n",
      "    \"NN\",\n",
      "    \"NR\",\n",
      "    \"NN\",\n",
      "    \"PU\"\n",
      "  ],\n",
      "  \"con\": [\n",
      "    \"TOP\",\n",
      "    [[\"IP\", [[\"NP-TMP\", [[\"NT\", [\"2021年\"]]]], [\"NP-PN-SBJ\", [[\"NR\", [\"HanLPv2.1\"]]]], [\"VP\", [[\"VV\", [\"带来\"]], [\"NP-OBJ\", [[\"CP\", [[\"CP\", [[\"IP\", [[\"VP\", [[\"ADVP\", [[\"AD\", [\"最\"]]]], [\"VP\", [[\"VA\", [\"先进\"]]]]]]]], [\"DEC\", [\"的\"]]]]]], [\"NP\", [[\"QP\", [[\"CD\", [\"多\"]]]], [\"NP\", [[\"NN\", [\"语种\"]]]]]], [\"NP\", [[\"NR\", [\"NLP\"]], [\"NN\", [\"技术\"]]]]]]]], [\"PU\", [\"。\"]]]]]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "doc = nlp(tok=[\"2021年\", \"HanLPv2.1\", \"带来\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"])\n",
    "print(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "流水线的输出也是一个Document，所以支持可视化："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Token&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">PoS&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;10<br>────────────────────────────────────────────────────────────────────────<br>NT&nbsp;─────────────────────────────────────────────────────►NP-TMP&nbsp;────┐&nbsp;&nbsp;&nbsp;<br>NR&nbsp;─────────────────────────────────────────────────────►NP-PN-SBJ──┤&nbsp;&nbsp;&nbsp;<br>VV&nbsp;────────────────────────────────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>AD&nbsp;───►ADVP──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nb
sp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>VA&nbsp;───►VP&nbsp;───┴►VP&nbsp;────►IP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>DEC──────────────────────────┴►CP&nbsp;────►CP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP─────────┼►IP<br>CD&nbsp;───►QP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN&nbsp;───►NP&nbsp;───┴────────────────────────►NP────┼►NP-OBJ──┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NR&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN&nbsp;──┴────────────────────────────────►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>PU&nbsp;─────────────────────────────────────────────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "doc.pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如果要分析原始文本的话，分词是第一步，所以先加载一个分词器："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "然后将分词器插入到流水线的第一级："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[None->TransformerTaggingTokenizer->tok,\n",
       " tok->TransformerTagger->pos,\n",
       " tok->CRFConstituencyParser->con,\n",
       " None->merge_pos_into_con->None]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nlp.insert(0, tok, output_key='tok')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "然后就可以直接分析原始文本了："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(TOP\n",
      "  (IP\n",
      "    (NT 2021)\n",
      "    (M 年)\n",
      "    (NP-PN-SBJ (NR HanLPv2.1))\n",
      "    (VP\n",
      "      (VV 带来)\n",
      "      (NP-OBJ\n",
      "        (CP (CP (IP (VP (ADVP (AD 最)) (VP (VA 先进)))) (DEC 的)))\n",
      "        (NP (QP (CD 多)) (NP (NN 语种)))\n",
      "        (NP (NR NLP) (NN 技术))))\n",
      "    (PU 。)))\n"
     ]
    }
   ],
   "source": [
    "print(nlp('2021年HanLPv2.1带来最先进的多语种NLP技术。')['con'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "你明白吗？HanLP是为聪明人设计的，只要你足够聪明，你就可以优雅地实现各种功能。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 操作短语树的技巧"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "短语结构树的类型为`phrasetree.tree.Tree`，提供了许多接口，此处列举其中一些常用的接口。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(TOP\n",
      "  (IP\n",
      "    (NP-TMP (NT 2021年))\n",
      "    (NP-PN-SBJ (NR HanLPv2.1))\n",
      "    (VP\n",
      "      (VV 带来)\n",
      "      (NP-OBJ\n",
      "        (CP (CP (IP (VP (ADVP (AD 最)) (VP (VA 先进)))) (DEC 的)))\n",
      "        (NP (QP (CD 多)) (NP (NN 语种)))\n",
      "        (NP (NR NLP) (NN 技术))))\n",
      "    (PU 。)))\n"
     ]
    }
   ],
   "source": [
    "tree = doc['con'] # tree数组的话则需要doc['con'][0]\n",
    "print(tree)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 按高度枚举子树"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "子树：(VP (ADVP (AD 最)) (VP (VA 先进)))\t标签：VP\t短语：['最', '先进']\n",
      "子树：(NP (QP (CD 多)) (NP (NN 语种)))\t标签：NP\t短语：['多', '语种']\n"
     ]
    }
   ],
   "source": [
    "for subtree in tree.subtrees(lambda t: t.height() == 4):\n",
    "    print(f'子树：{subtree}\\t标签：{subtree.label()}\\t短语：{subtree.leaves()}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 按标签枚举子树"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(NP (QP (CD 多)) (NP (NN 语种)))\n",
      "(NP (NN 语种))\n",
      "(NP (NR NLP) (NN 技术))\n"
     ]
    }
   ],
   "source": [
    "for subtree in tree.subtrees(lambda t: t.label() == 'NP'):\n",
    "    print(subtree)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 遍历子节点"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "父节点(NP (NR NLP) (NN 技术))的子节点有：\n",
      "(NR NLP)\n",
      "(NN 技术)\n"
     ]
    }
   ],
   "source": [
    "print(f'父节点{subtree}的子节点有：')\n",
    "for child in subtree:\n",
    "    print(child)"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "con_stl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/cor_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/cor_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fcor_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "#### 申请密钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 指代消解\n",
    "任务越少，速度越快。如指定仅执行指代消解："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [],
   "source": [
    "ret = HanLP.coreference_resolution('我姐送我她的猫。我很喜欢它。')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回值为一个包含分词结果与簇的dict:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ret == {'clusters': [\n",
    "              [['我', 0, 1], ['我', 3, 4], ['我', 8, 9]], # 指代说话人\n",
    "              [['我姐', 0, 2], ['她', 4, 5]],             # 指代说话人的姐姐\n",
    "              [['她的猫', 4, 7], ['它', 11, 12]]],        # 指代说话人的姐姐的猫\n",
    "        'tokens': ['我', '姐', '送', '我', '她', '的', '猫', '。', '我', '很', '喜欢', '它', '。']}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wxctCigrTKu-"
   },
   "source": [
    "对应如下结构：\n",
    "![cor](https://file.hankcs.com/img/coref_demo_small.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XOsWkOqQfzlr"
   },
   "source": [
    "为已分词的句子执行指代消解："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "bLZSTbv_f3OA",
    "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
   },
   "outputs": [],
   "source": [
    "clusters = HanLP.coreference_resolution(tokens=[['我', '姐', '送', '我', '她', '的', '猫', '。'],\n",
    "                                                ['我', '很', '喜欢', '它', '。']])\n",
    "             "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回值为簇的list："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clusters == [\n",
    "              [['我', 0, 1], ['我', 3, 4], ['我', 8, 9]], # 指代说话人\n",
    "              [['我姐', 0, 2], ['她', 4, 5]],             # 指代说话人的姐姐\n",
    "              [['她的猫', 4, 7], ['它', 11, 12]]]         # 指代说话人的姐姐的猫"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "cor_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_amr.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-04-12 22:19
import hanlp

parser = hanlp.load(hanlp.pretrained.amr.MRP2020_AMR_ENG_ZHO_XLM_BASE)

# Chinese: a single pre-tokenized sentence, parsed twice (the second time with output_amr=False).
# A fresh copy of the token list is handed to every call.
zh_tokens = ["男孩", "希望", "女孩", "相信", "他", "。"]
print(parser(list(zh_tokens)))
print(parser(list(zh_tokens), output_amr=False))

# English: the same kind of input, with the language stated explicitly.
en_tokens = ['The', 'boy', 'wants', 'the', 'girl', 'to', 'believe', 'him', '.']
print(parser(list(en_tokens), language='eng'))

# It's suggested to also feed the lemma for stabler performance: each token is paired with its lemma.
en_lemmas = ['the', 'boy', 'want', 'the', 'girl', 'to', 'believe', 'he', '.']
print(parser(list(zip(en_tokens, en_lemmas)), language='eng'))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-15 22:26
import hanlp
from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization

# Load the multi-task model
HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
# Fetch the tokenization task (every task whose name starts with tok is a tokenization task;
# the fine-grained standard is used here as an example)
tok: TaggingTokenization = HanLP['tok/fine']

# Baseline run with both dictionaries cleared (printed label: "no dictionary attached")
tok.dict_force = tok.dict_combine = None
print(f'不挂词典:\n{HanLP("商品和服务项目")["tok/fine"]}')

# dict_force given as a set of words (printed label: "force mode")
tok.dict_force = {'和服', '服务项目'}
print(f'强制模式:\n{HanLP("商品和服务项目")["tok/fine"]}')  # Use with caution; see Chapter 2 of the book "Introduction to Natural Language Processing"

# dict_force given as a mapping from a string to a list of tokens (printed label: "forced correction")
tok.dict_force = {'和服务': ['和', '服务']}
print(f'强制校正:\n{HanLP("正向匹配商品和服务、任何和服务必按上述切分")["tok/fine"]}')

# Switch from dict_force to dict_combine (printed label: "combine mode")
tok.dict_force = None
tok.dict_combine = {'和服', '服务项目'}
print(f'合并模式:\n{HanLP("商品和服务项目")["tok/fine"]}')

# Understanding this requires some background in algorithms; beginners may refer to http://nlp.hankcs.com/book.php
# See also https://hanlp.hankcs.com/docs/api/hanlp/components/tokenizers/transformer.html

# Words containing spaces, tabs, etc. (characters removed by the Transformer tokenizer) must be given as tuples
tok.dict_combine = {('iPad', 'Pro'), '2个空格'}
print(f'空格匹配：\n{HanLP("如何评价iPad Pro ？iPad  Pro有2个空格", tasks="tok/fine")["tok/fine"]}')
# For clever users, read on: a string in the tuple dictionary is in fact equivalent to every possible
# segmentation of that string
print(f'词典内容：\n{dict(tok.dict_combine.config["dictionary"]).keys()}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict_stl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-15 22:26
import hanlp
from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer

# Load an older single-task model to demonstrate a tokenization error (already fixed in the latest version):
tok: TransformerTaggingTokenizer = hanlp.load('https://file.hankcs.com/hanlp/tok/coarse_electra_small_20220220_013548.zip')

# Baseline run with both dictionaries cleared (printed label: "no dictionary attached")
tok.dict_force = tok.dict_combine = None
print(f'不挂词典:\n{tok("首相和川普通电话")}')

# dict_force given as a set of words (printed label: "force mode")
tok.dict_force = {'川普'}
print(f'强制模式:\n{tok(["首相和川普通电话", "银川普通人与川普通电话讲四川普通话"])}')  # Use with caution; see Chapter 2 of the book "Introduction to Natural Language Processing"

# dict_force given as a mapping from a string to a list of tokens (printed label: "forced correction")
tok.dict_force = {'川普通电话': ['川普', '通', '电话']}
print(f'强制校正:\n{tok(["首相和川普通电话", "银川普通人与川普通电话讲四川普通话"])}')

# Switch from dict_force to dict_combine (printed label: "combine mode")
tok.dict_force = None
tok.dict_combine = {'美国总统'}
print(f'合并模式:\n{tok("首相和川普通电话，川普是美国总统。")}')

# Understanding this requires some background in algorithms; beginners may refer to http://nlp.hankcs.com/book.php
# See also https://hanlp.hankcs.com/docs/api/hanlp/components/tokenizers/transformer.html


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_del_tasks.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-02-03 13:28
import hanlp
from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
from hanlp_common.document import Document

HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)

# Copy the task names into a list before any task is deleted.
tasks = [*HanLP.tasks.keys()]
print(tasks)  # Pick what you need from what we have

# Drop every task except the ones named here.
keep = ('tok', 'pos')
for name in [t for t in tasks if t not in keep]:
    del HanLP[name]

# The trimmed model can be saved as a new component and loaded back later:
# HanLP.save('path/to/new/component')
# HanLP.load('path/to/new/component')
print(HanLP.tasks.keys())

sentences = ['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', 'up主来到北京立方庭参观自然语义科技公司。']
doc: Document = HanLP(sentences)
print(doc)
doc.pretty_print()


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_document.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-10-26 23:40
from hanlp_common.document import Document

# A Document can be built by hand from per-task annotations (or obtained from HanLP.parse).
# Each annotation below describes one sentence; the Document receives a list of sentences.
tokens = ["晓美焰", "来到", "北京", "立方庭", "参观", "自然", "语义", "科技", "公司"]
tags = ["NR", "VV", "NR", "NR", "VV", "NN", "NN", "NN", "NN"]
entities = [
    ["晓美焰", "PERSON", 0, 1],
    ["北京立方庭", "LOCATION", 2, 4],
    ["自然语义科技公司", "ORGANIZATION", 5, 9],
]
arcs = [
    [2, "nsubj"], [0, "root"], [4, "name"], [2, "dobj"], [2, "conj"],
    [9, "compound"], [9, "compound"], [9, "compound"], [5, "dobj"],
]
doc = Document(tok=[tokens], pos=[tags], ner=[entities], dep=[arcs])

# print(doc) or str(doc) gives its JSON representation
print(doc)

# An annotation is looked up by its task name
print(doc['tok'])

# Number of sentences
sentence_count = doc.count_sentences()
print(f'It has {sentence_count} sentence(s)')

# The n-th sentence
first_sentence = doc.squeeze(0)
print(first_sentence['tok'])

# Pretty print it right in your console or notebook
doc.pretty_print()

# The pretty prints can also be kept in a str
pretty_text: str = '\n\n'.join(doc.to_pretty())

# A Document can also be created from a dict keyed by task name
fine_tokens = ["晓美焰", "来到", "北京", "立方庭", "参观", "自然", "语义", "科技", "公司", "。"]
coarse_tokens = ["晓美焰", "来到", "北京立方庭", "参观", "自然语义科技公司", "。"]
ctb_tags = ["NR", "VV", "NR", "NR", "VV", "NN", "NN", "NN", "NN", "PU"]
pku_tags = ["nr", "v", "ns", "nz", "v", "n", "n", "n", "n", "w"]
msra_entities = [
    ["晓美焰", "PERSON", 0, 1],
    ["北京立方庭", "LOCATION", 2, 4],
    ["自然语义科技公司", "ORGANIZATION", 5, 9],
]
ontonotes_entities = [
    ["晓美焰", "PERSON", 0, 1],
    ["北京", "GPE", 2, 3],
    ["立方庭", "FAC", 3, 4],
    ["自然语义科技公司", "ORG", 5, 9],
]
# One entry per predicate in the sentence
predicate_frames = [
    [["晓美焰", "ARG0", 0, 1], ["来到", "PRED", 1, 2], ["北京立方庭", "ARG1", 2, 4]],
    [["晓美焰", "ARG0", 0, 1], ["参观", "PRED", 4, 5], ["自然语义科技公司", "ARG1", 5, 9]],
]
full_arcs = [
    [2, "nsubj"], [0, "root"], [4, "name"], [2, "dobj"], [2, "conj"],
    [9, "compound"], [9, "compound"], [9, "compound"], [5, "dobj"], [2, "punct"],
]
doc = Document({
    "tok/fine": [fine_tokens],
    "tok/coarse": [coarse_tokens],
    "pos/ctb": [ctb_tags],
    "pos/pku": [pku_tags],
    "ner/msra": [msra_entities],
    "ner/ontonotes": [ontonotes_entities],
    "srl": [predicate_frames],
    "dep": [full_arcs],
})
# Pretty print using a different NER annotation
doc.pretty_print(ner='ner/ontonotes')
# Get the first annotation for NER
print(doc.get_by_prefix('ner'))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_mlm.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-29 21:11
from hanlp.components.lm.mlm import MaskedLanguageModel

mlm = MaskedLanguageModel()
mlm.load('bert-base-chinese')

# First a single masked sentence, then a list of sentences in one call.
# Batching is always faster
single = '生活的真谛是[MASK]。'
batch = [single, '巴黎是[MASK][MASK]的首都。']
for query in (single, batch):
    print(mlm(query))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_mtl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 13:51
import hanlp
from hanlp_common.document import Document

# CLOSE refers to the closed-source corpus annotated by Natural Semantics, BASE is the medium-sized model,
# ZH stands for Chinese
model_id = hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH
HanLP = hanlp.load(model_id)

# All tasks are executed by default
sentences = ['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。']
doc: Document = HanLP(sentences)

# The returned Document is a subclass of dict; its printed form is JSON-compatible
print(doc)

# Instant visualization; maximize the window to avoid line wrapping, calling it in a Jupyter Notebook
# is recommended
doc.pretty_print()

# To visualize the NER of the OntoNotes standard instead:
# doc.pretty_print(ner='ner/ontonotes', pos='pku')
# NOTE(review): the pos key above presumably needs to be 'pos/pku' to match a task name — confirm.


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_ner_dict.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-04-29 11:06
import hanlp
from hanlp.components.mtl.tasks.ner.tag_ner import TaggingNamedEntityRecognition
from hanlp.utils.io_util import get_resource

# Load the multi-task model and take its 'ner/msra' task, whose dictionaries are customized below.
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH)
ner: TaggingNamedEntityRecognition = HanLP['ner/msra']
# dict_whitelist maps a piece of text to an entity label.
ner.dict_whitelist = {'午饭后': 'TIME'}
doc = HanLP('2021年测试高血压是138，时间是午饭后2点45，低血压是44', tasks='ner/msra')
doc.pretty_print()
print(doc['ner/msra'])

# dict_tags maps a tuple of tokens to a tuple of tags of the same length.
ner.dict_tags = {('名字', '叫', '金华'): ('O', 'O', 'S-PERSON')}
HanLP('他在浙江金华出生，他的名字叫金华。', tasks='ner/msra').pretty_print()

# HanLP.save(get_resource(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH))

# Understanding this requires some background in algorithms; beginners may refer to http://nlp.hankcs.com/book.php
# See https://hanlp.hankcs.com/docs/api/hanlp/components/mtl/tasks/ner/tag_ner.html


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_parse_constituency.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-18 11:09
from hanlp_common.document import Document
import hanlp

con = hanlp.load(hanlp.pretrained.constituency.CTB9_CON_FULL_TAG_ELECTRA_SMALL)
# One pre-tokenized sentence is parsed here.
# To speed up, parse multiple sentences at once, and use a GPU.
sample_tokens = ["2021年", "HanLPv2.1", "带来", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"]
parsed = con(sample_tokens)
print(parsed)


# The rest of this tutorial is written for clever users.
# The first level of non-terminals are PoS tags. So usually a PoS model is piped.
def merge_pos_into_con(doc: Document):
    """Fill placeholder pre-terminal labels of ``doc['con']`` with tags from ``doc['pos']``.

    Each tree in ``doc['con']`` is paired with the tag sequence at the same
    position in ``doc['pos']``. The subtrees whose ``height()`` is 2 are visited
    in the order ``tree.subtrees`` yields them; the i-th one is relabelled with
    ``tags[i]`` when its current label is ``'_'``. Trees are modified in place.

    When ``doc['pos'][0]`` is a ``str`` the document is treated as a single
    sentence: every value is wrapped in a one-element list for processing and
    the result is squeezed back before returning.

    Args:
        doc: A document holding at least the ``'pos'`` and ``'con'`` annotations.

    Returns:
        The document with merged labels. An empty document is returned unchanged.
    """
    pos = doc['pos']
    # Guard against an empty document before peeking at the first element;
    # indexing an empty ``pos`` would raise IndexError.
    flat = bool(pos) and isinstance(pos[0], str)
    if flat:
        doc = Document((k, [v]) for k, v in doc.items())
    for tree, tags in zip(doc['con'], doc['pos']):
        # The position among the height-2 subtrees is the index of the matching tag.
        for offset, subtree in enumerate(tree.subtrees(lambda t: t.height() == 2)):
            if subtree.label() == '_':
                subtree.set_label(tags[offset])
    if flat:
        doc = doc.squeeze()
    return doc


pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
# Chain the PoS tagger, the constituency parser and the merging function into one pipeline.
nlp = (
    hanlp.pipeline()
    .append(pos, input_key='tok', output_key='pos')
    .append(con, input_key='tok', output_key='con')
    .append(merge_pos_into_con, input_key='*')
)
print(f'The pipeline looks like this: {nlp}')
tokenized_sentence = ["2021年", "HanLPv2.1", "带来", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"]
doc = nlp(tok=tokenized_sentence)
print(doc)
doc.pretty_print()

# If you need to parse raw text, simply add a tokenizer into this pipeline.
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
nlp.insert(0, tok, output_key='tok')
print(f'The pipeline looks like this: {nlp}')
raw_text = '2021年HanLPv2.1带来最先进的多语种NLP技术。'
doc = nlp(raw_text)
print(doc)
doc.pretty_print()

# ATTENTION: Pipelines are usually slower than MTL but they are more flexible.


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_pipeline.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 20:47
import hanlp

# A pipeline can blend any callables, whether they are rules, TensorFlow components or PyTorch
# components. However, it's slower than the MTL framework.
# In case both tf and torch are used, load tf first, e.g.:
# pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE)

# Build the pipeline one stage at a time, re-binding the result of every append.
HanLP = hanlp.pipeline()
HanLP = HanLP.append(hanlp.utils.rules.split_sentence, output_key='sentences')
HanLP = HanLP.append(hanlp.load('CTB9_TOK_ELECTRA_SMALL'), output_key='tok')
HanLP = HanLP.append(hanlp.load('CTB9_POS_ELECTRA_SMALL'), output_key='pos')
HanLP = HanLP.append(hanlp.load('MSRA_NER_ELECTRA_SMALL_ZH'), output_key='ner', input_key='tok')
HanLP = HanLP.append(hanlp.load('CTB9_DEP_ELECTRA_SMALL', conll=False), output_key='dep', input_key='tok')
HanLP = HanLP.append(hanlp.load('CTB9_CON_ELECTRA_SMALL'), output_key='con', input_key='tok')

text = '2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。'
doc = HanLP(text)
print(doc)
doc.pretty_print()


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_pos_dict.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-15 22:26
import hanlp
from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
from hanlp.components.mtl.tasks.pos import TransformerTagging
from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization
from tests import cdroot

cdroot()
HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)

# Demonstrates custom dict in part-of-speech tagging
pos: TransformerTagging = HanLP['pos/ctb']

# A single word mapped to a custom tag (printed label: "customize the tag of a single word").
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
print('自定义单个词性:')
pos.dict_tags = {'HanLP': 'state-of-the-art-tool'}
HanLP("HanLP为生产环境带来次世代最先进的多语种NLP技术。", tasks='pos/ctb').pretty_print()

# A tuple of words mapped to a tuple of tags, next to a single-word entry
# (printed label: "customize tags according to context").
print('根据上下文自定义词性:')
pos.dict_tags = {('的', '希望'): ('补语成分', '名词'), '希望': '动词'}
HanLP("我的希望是希望张晚霞的背影被晚霞映红。", tasks='pos/ctb').pretty_print()

# Understanding this requires some background in algorithms; beginners may refer to http://nlp.hankcs.com/book.php
# See also https://hanlp.hankcs.com/docs/api/hanlp/components/taggers/transformer_tagger.html


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_sts.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-24 13:15
import hanlp

sim = hanlp.load(hanlp.pretrained.sts.STS_ELECTRA_BASE_ZH)

# Every item is a pair of sentences; all pairs are passed to the model in a single call.
sentence_pairs = [
    ['看图猜一电影名', '看图猜电影'],
    ['无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'],
    ['北京到上海的动车票', '上海到北京的动车票'],
]
result = sim(sentence_pairs)
print(result)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_word2vec.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-12 18:33
import hanlp
import torch

word2vec = hanlp.load(hanlp.pretrained.word2vec.CONVSEG_W2V_NEWS_TENSITE_WORD_PKU)
vec = word2vec('先进')
print(vec)

# Cosine similarity between the vector of one word and the vectors of two other words.
cosine = torch.nn.functional.cosine_similarity
for other in ('优秀', '水果'):
    print(cosine(word2vec('先进'), word2vec(other), dim=0))

# Each entry: the caption to print, the query, and the keyword options for most_similar.
# Captions in order: "most similar words", "the query is OOV so nothing can be retrieved",
# "in doc2vec mode similarity works for OOV too", "even short texts can be handled".
# Batching is faster, e.g. word2vec.most_similar(['上海', '寒冷'])
queries = [
    ('获取语义最相似的词语：', '上海', {}),
    ('非常寒冷是OOV所以无法获取：', '非常寒冷', {}),
    ('但是在doc2vec模式下OOV也可以进行相似度计算：', '非常寒冷', {'doc2vec': True}),
    ('甚至可以处理短文本：', '国家图书馆推出2022年春节主题活动', {'doc2vec': True}),
]
for caption, query, options in queries:
    print(caption)
    print(word2vec.most_similar(query, **options))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/dep_mtl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/dep_mtl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fdep_mtl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n",
       " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n",
       " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.mtl.ALL # MTL多任务，具体任务见模型名称，语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存。自然语言处理分为许多任务，分词只是最初级的一个。与其每个任务单独创建一个模型，不如利用HanLP的联合模型一次性完成多个任务："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 依存句法分析\n",
    "任务越少，速度越快。如指定仅执行依存句法分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [],
   "source": [
    "doc = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'], tasks='dep')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok/fine\": [\n",
      "    [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n",
      "    [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n",
      "  ],\n",
      "  \"dep\": [\n",
      "    [[6, \"tmod\"], [6, \"nsubj\"], [6, \"prep\"], [5, \"nn\"], [3, \"pobj\"], [0, \"root\"], [8, \"amod\"], [15, \"nn\"], [10, \"advmod\"], [15, \"rcmod\"], [10, \"assm\"], [13, \"nummod\"], [15, \"nn\"], [15, \"nn\"], [6, \"dobj\"], [6, \"punct\"]],\n",
      "    [[2, \"nsubj\"], [0, \"root\"], [4, \"nn\"], [2, \"dobj\"], [2, \"conj\"], [9, \"nn\"], [9, \"nn\"], [9, \"nn\"], [5, \"dobj\"], [2, \"punct\"]]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`doc['dep']`为句子们的依存句法树列表，第`i`个二元组表示第`i`个单词的`[中心词的下标, 与中心词的依存关系]`。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wxctCigrTKu-"
   },
   "source": [
    "可视化依存句法树："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Zo08uquCTFSk",
    "outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dep Tree    \tToken    \tRelati\n",
      "────────────\t─────────\t──────\n",
      " ┌─────────►\t2021年    \ttmod  \n",
      " │┌────────►\tHanLPv2.1\tnsubj \n",
      " ││┌─►┌─────\t为        \tprep  \n",
      " │││  │  ┌─►\t生产       \tnn    \n",
      " │││  └─►└──\t环境       \tpobj  \n",
      "┌┼┴┴────────\t带来       \troot  \n",
      "││       ┌─►\t次        \tamod  \n",
      "││  ┌───►└──\t世代       \tnn    \n",
      "││  │    ┌─►\t最        \tadvmod\n",
      "││  │┌──►├──\t先进       \trcmod \n",
      "││  ││   └─►\t的        \tassm  \n",
      "││  ││   ┌─►\t多        \tnummod\n",
      "││  ││┌─►└──\t语种       \tnn    \n",
      "││  │││  ┌─►\tNLP      \tnn    \n",
      "│└─►└┴┴──┴──\t技术       \tdobj  \n",
      "└──────────►\t。        \tpunct \n",
      "\n",
      "Dep Tree    \tTok\tRelat\n",
      "────────────\t───\t─────\n",
      "         ┌─►\t阿婆主\tnsubj\n",
      "┌┬────┬──┴──\t来到 \troot \n",
      "││    │  ┌─►\t北京 \tnn   \n",
      "││    └─►└──\t立方庭\tdobj \n",
      "│└─►┌───────\t参观 \tconj \n",
      "│   │  ┌───►\t自然 \tnn   \n",
      "│   │  │┌──►\t语义 \tnn   \n",
      "│   │  ││┌─►\t科技 \tnn   \n",
      "│   └─►└┴┴──\t公司 \tdobj \n",
      "└──────────►\t。  \tpunct\n"
     ]
    }
   ],
   "source": [
    "doc.pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "转换为CoNLL格式："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2021年\t_\t_\t_\t_\t6\ttmod\t_\t_\n",
      "2\tHanLPv2.1\t_\t_\t_\t_\t6\tnsubj\t_\t_\n",
      "3\t为\t_\t_\t_\t_\t6\tprep\t_\t_\n",
      "4\t生产\t_\t_\t_\t_\t5\tnn\t_\t_\n",
      "5\t环境\t_\t_\t_\t_\t3\tpobj\t_\t_\n",
      "6\t带来\t_\t_\t_\t_\t0\troot\t_\t_\n",
      "7\t次\t_\t_\t_\t_\t8\tamod\t_\t_\n",
      "8\t世代\t_\t_\t_\t_\t15\tnn\t_\t_\n",
      "9\t最\t_\t_\t_\t_\t10\tadvmod\t_\t_\n",
      "10\t先进\t_\t_\t_\t_\t15\trcmod\t_\t_\n",
      "11\t的\t_\t_\t_\t_\t10\tassm\t_\t_\n",
      "12\t多\t_\t_\t_\t_\t13\tnummod\t_\t_\n",
      "13\t语种\t_\t_\t_\t_\t15\tnn\t_\t_\n",
      "14\tNLP\t_\t_\t_\t_\t15\tnn\t_\t_\n",
      "15\t技术\t_\t_\t_\t_\t6\tdobj\t_\t_\n",
      "16\t。\t_\t_\t_\t_\t6\tpunct\t_\t_\n",
      "\n",
      "1\t阿婆主\t_\t_\t_\t_\t2\tnsubj\t_\t_\n",
      "2\t来到\t_\t_\t_\t_\t0\troot\t_\t_\n",
      "3\t北京\t_\t_\t_\t_\t4\tnn\t_\t_\n",
      "4\t立方庭\t_\t_\t_\t_\t2\tdobj\t_\t_\n",
      "5\t参观\t_\t_\t_\t_\t2\tconj\t_\t_\n",
      "6\t自然\t_\t_\t_\t_\t9\tnn\t_\t_\n",
      "7\t语义\t_\t_\t_\t_\t9\tnn\t_\t_\n",
      "8\t科技\t_\t_\t_\t_\t9\tnn\t_\t_\n",
      "9\t公司\t_\t_\t_\t_\t5\tdobj\t_\t_\n",
      "10\t。\t_\t_\t_\t_\t2\tpunct\t_\t_\n"
     ]
    }
   ],
   "source": [
    "print(doc.to_conll())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XOsWkOqQfzlr"
   },
   "source": [
    "为已分词的句子执行依存句法分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "bLZSTbv_f3OA",
    "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dep Tree   \tToken\tRelati\n",
      "───────────\t─────\t──────\n",
      " ┌────────►\tHanLP\tnsubj \n",
      " │┌─►┌─────\t为    \tprep  \n",
      " ││  │  ┌─►\t生产   \tnn    \n",
      " ││  └─►└──\t环境   \tpobj  \n",
      "┌┼┴────────\t带来   \troot  \n",
      "││  ┌─────►\t次世代  \tnn    \n",
      "││  │   ┌─►\t最    \tadvmod\n",
      "││  │┌─►├──\t先进   \trcmod \n",
      "││  ││  └─►\t的    \tassm  \n",
      "││  ││ ┌──►\t多语种  \tnn    \n",
      "││  ││ │┌─►\tNLP  \tnn    \n",
      "│└─►└┴─┴┴──\t技术   \tdobj  \n",
      "└─────────►\t。    \tpunct \n",
      "\n",
      "Dep Tree        \tTok\tRelation \n",
      "────────────────\t───\t─────────\n",
      "          ┌─►┌──\t我  \tassmod   \n",
      "          │  └─►\t的  \tassm     \n",
      "       ┌─►└─────\t希望 \ttop      \n",
      "┌┬─────┴────────\t是  \troot     \n",
      "│└─►┌───────────\t希望 \tccomp    \n",
      "│   │     ┌─►┌──\t张晚霞\tassmod   \n",
      "│   │     │  └─►\t的  \tassm     \n",
      "│   │  ┌─►└─────\t背影 \tnsubjpass\n",
      "│   └─►└──┬─────\t被  \tccomp    \n",
      "│         │  ┌─►\t晚霞 \tnsubj    \n",
      "│         └─►└──\t映红 \tdep      \n",
      "└──────────────►\t。  \tpunct    \n"
     ]
    }
   ],
   "source": [
    "HanLP([\n",
    "    [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
    "    [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
    "  ], tasks='dep', skip_tasks='tok*').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 注意\n",
    "Native API的输入单位限定为句子，需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外，RESTful和native两种API的语义设计完全一致，用户可以无缝互换。"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "dep_mtl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/dep_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/dep_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fdep_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "#### 申请密钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 依存句法分析\n",
    "任务越少，速度越快。如指定仅执行依存句法分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [],
   "source": [
    "doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', tasks='dep')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok/fine\": [\n",
      "    [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"]\n",
      "  ],\n",
      "  \"dep\": [\n",
      "    [[6, \"tmod\"], [6, \"nsubj\"], [6, \"prep\"], [5, \"nn\"], [3, \"pobj\"], [0, \"root\"], [8, \"clf\"], [10, \"dep\"], [10, \"advmod\"], [15, \"rcmod\"], [10, \"cpm\"], [13, \"nummod\"], [15, \"nn\"], [15, \"nn\"], [6, \"dobj\"], [6, \"punct\"]]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`doc['dep']`为各个句子的依存句法树列表，每棵树中第`i`个二元组表示第`i`个单词的`[中心词的下标, 与中心词的依存关系]`；下标从1开始，0表示虚拟根节点。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wxctCigrTKu-"
   },
   "source": [
    "可视化依存句法树："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Zo08uquCTFSk",
    "outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dep Tree     \tToken    \tRelati\n",
      "─────────────\t─────────\t──────\n",
      "  ┌─────────►\t2021年    \ttmod  \n",
      "  │┌────────►\tHanLPv2.1\tnsubj \n",
      "  ││┌─►┌─────\t为        \tprep  \n",
      "  │││  │  ┌─►\t生产       \tnn    \n",
      "  │││  └─►└──\t环境       \tpobj  \n",
      "┌┬┴┴┴────────\t带来       \troot  \n",
      "││        ┌─►\t次        \tclf   \n",
      "││     ┌─►└──\t世代       \tdep   \n",
      "││     │  ┌─►\t最        \tadvmod\n",
      "││  ┌─►└──┼──\t先进       \trcmod \n",
      "││  │     └─►\t的        \tcpm   \n",
      "││  │     ┌─►\t多        \tnummod\n",
      "││  │  ┌─►└──\t语种       \tnn    \n",
      "││  │  │  ┌─►\tNLP      \tnn    \n",
      "│└─►└──┴──┴──\t技术       \tdobj  \n",
      "└───────────►\t。        \tpunct \n"
     ]
    }
   ],
   "source": [
    "doc.pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "转换为CoNLL格式："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2021年\t_\t_\t_\t_\t6\ttmod\t_\t_\n",
      "2\tHanLPv2.1\t_\t_\t_\t_\t6\tnsubj\t_\t_\n",
      "3\t为\t_\t_\t_\t_\t6\tprep\t_\t_\n",
      "4\t生产\t_\t_\t_\t_\t5\tnn\t_\t_\n",
      "5\t环境\t_\t_\t_\t_\t3\tpobj\t_\t_\n",
      "6\t带来\t_\t_\t_\t_\t0\troot\t_\t_\n",
      "7\t次\t_\t_\t_\t_\t8\tclf\t_\t_\n",
      "8\t世代\t_\t_\t_\t_\t10\tdep\t_\t_\n",
      "9\t最\t_\t_\t_\t_\t10\tadvmod\t_\t_\n",
      "10\t先进\t_\t_\t_\t_\t15\trcmod\t_\t_\n",
      "11\t的\t_\t_\t_\t_\t10\tcpm\t_\t_\n",
      "12\t多\t_\t_\t_\t_\t13\tnummod\t_\t_\n",
      "13\t语种\t_\t_\t_\t_\t15\tnn\t_\t_\n",
      "14\tNLP\t_\t_\t_\t_\t15\tnn\t_\t_\n",
      "15\t技术\t_\t_\t_\t_\t6\tdobj\t_\t_\n",
      "16\t。\t_\t_\t_\t_\t6\tpunct\t_\t_\n"
     ]
    }
   ],
   "source": [
    "print(doc.to_conll())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XOsWkOqQfzlr"
   },
   "source": [
    "为已分词的句子执行依存句法分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "bLZSTbv_f3OA",
    "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dep Tree   \tToken\tRelati\n",
      "───────────\t─────\t──────\n",
      " ┌────────►\tHanLP\tnsubj \n",
      " │┌─►┌─────\t为    \tprep  \n",
      " ││  │  ┌─►\t生产   \tnn    \n",
      " ││  └─►└──\t环境   \tpobj  \n",
      "┌┼┴────────\t带来   \troot  \n",
      "││     ┌──►\t次世代  \tdep   \n",
      "││     │┌─►\t最    \tadvmod\n",
      "││  ┌─►└┼──\t先进   \trcmod \n",
      "││  │   └─►\t的    \tcpm   \n",
      "││  │  ┌──►\t多语种  \tnn    \n",
      "││  │  │┌─►\tNLP  \tnn    \n",
      "│└─►└──┴┴──\t技术   \tdobj  \n",
      "└─────────►\t。    \tpunct \n",
      "\n",
      "Dep Tree        \tTok\tRelation \n",
      "────────────────\t───\t─────────\n",
      "          ┌─►┌──\t我  \tassmod   \n",
      "          │  └─►\t的  \tassm     \n",
      "       ┌─►└─────\t希望 \ttop      \n",
      "┌┬─────┴────────\t是  \troot     \n",
      "│└─►┌───────────\t希望 \tccomp    \n",
      "│   │     ┌─►┌──\t张晚霞\tassmod   \n",
      "│   │     │  └─►\t的  \tassm     \n",
      "│   │  ┌─►└─────\t背影 \tnsubjpass\n",
      "│   └─►└──┬─────\t被  \tccomp    \n",
      "│         │  ┌─►\t晚霞 \tnsubj    \n",
      "│         └─►└──\t映红 \tdep      \n",
      "└──────────────►\t。  \tpunct    \n"
     ]
    }
   ],
   "source": [
    "HanLP(tokens=[\n",
    "    [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
    "    [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
    "  ], tasks='dep').pretty_print()"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "dep_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/dep_stl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/dep_stl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fdep_stl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4M7ka0K5OMWU",
    "outputId": "69cdad22-d94d-41fb-9591-1c29515a3da9"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CTB5_BIAFFINE_DEP_ZH': 'https://file.hankcs.com/hanlp/dep/biaffine_ctb5_20191229_025833.zip',\n",
       " 'CTB7_BIAFFINE_DEP_ZH': 'https://file.hankcs.com/hanlp/dep/biaffine_ctb7_20200109_022431.zip',\n",
       " 'CTB9_DEP_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/dep/ctb9_dep_electra_small_20220216_100306.zip',\n",
       " 'PMT1_DEP_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/dep/pmt_dep_electra_small_20220218_134518.zip',\n",
       " 'CTB9_UDC_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/dep/udc_dep_electra_small_20220218_095452.zip',\n",
       " 'PTB_BIAFFINE_DEP_EN': 'https://file.hankcs.com/hanlp/dep/ptb_dep_biaffine_20200101_174624.zip'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.dep.ALL # 语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "BMW528wGNulM"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [],
   "source": [
    "dep = hanlp.load(hanlp.pretrained.dep.CTB9_DEP_ELECTRA_SMALL)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 依存句法分析\n",
    "依存句法分析任务的输入为已分词的一个或多个句子："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "BqEmDMGGOtk3"
   },
   "outputs": [],
   "source": [
    "tree = dep([\"2021年\", \"HanLPv2.1\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jj1Jk-2sPHYx"
   },
   "source": [
    "返回对象为[CoNLLSentence](https://hanlp.hankcs.com/docs/api/common/conll.html#hanlp_common.conll.CoNLLSentence)类型："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "U_PGm06m6K20",
    "outputId": "a25c6452-5032-42b3-d501-99158380c487"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'id': 1,\n",
       "  'form': '2021年',\n",
       "  'cpos': None,\n",
       "  'pos': None,\n",
       "  'head': 3,\n",
       "  'deprel': 'tmod',\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'phead': None,\n",
       "  'pdeprel': None},\n",
       " {'id': 2,\n",
       "  'form': 'HanLPv2.1',\n",
       "  'cpos': None,\n",
       "  'pos': None,\n",
       "  'head': 3,\n",
       "  'deprel': 'nsubj',\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'phead': None,\n",
       "  'pdeprel': None},\n",
       " {'id': 3,\n",
       "  'form': '带来',\n",
       "  'cpos': None,\n",
       "  'pos': None,\n",
       "  'head': 0,\n",
       "  'deprel': 'root',\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'phead': None,\n",
       "  'pdeprel': None},\n",
       " {'id': 4,\n",
       "  'form': '次',\n",
       "  'cpos': None,\n",
       "  'pos': None,\n",
       "  'head': 5,\n",
       "  'deprel': 'det',\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'phead': None,\n",
       "  'pdeprel': None},\n",
       " {'id': 5,\n",
       "  'form': '世代',\n",
       "  'cpos': None,\n",
       "  'pos': None,\n",
       "  'head': 7,\n",
       "  'deprel': 'dep',\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'phead': None,\n",
       "  'pdeprel': None},\n",
       " {'id': 6,\n",
       "  'form': '最',\n",
       "  'cpos': None,\n",
       "  'pos': None,\n",
       "  'head': 7,\n",
       "  'deprel': 'advmod',\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'phead': None,\n",
       "  'pdeprel': None},\n",
       " {'id': 7,\n",
       "  'form': '先进',\n",
       "  'cpos': None,\n",
       "  'pos': None,\n",
       "  'head': 12,\n",
       "  'deprel': 'rcmod',\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'phead': None,\n",
       "  'pdeprel': None},\n",
       " {'id': 8,\n",
       "  'form': '的',\n",
       "  'cpos': None,\n",
       "  'pos': None,\n",
       "  'head': 7,\n",
       "  'deprel': 'cpm',\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'phead': None,\n",
       "  'pdeprel': None},\n",
       " {'id': 9,\n",
       "  'form': '多',\n",
       "  'cpos': None,\n",
       "  'pos': None,\n",
       "  'head': 10,\n",
       "  'deprel': 'nummod',\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'phead': None,\n",
       "  'pdeprel': None},\n",
       " {'id': 10,\n",
       "  'form': '语种',\n",
       "  'cpos': None,\n",
       "  'pos': None,\n",
       "  'head': 12,\n",
       "  'deprel': 'nn',\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'phead': None,\n",
       "  'pdeprel': None},\n",
       " {'id': 11,\n",
       "  'form': 'NLP',\n",
       "  'cpos': None,\n",
       "  'pos': None,\n",
       "  'head': 12,\n",
       "  'deprel': 'nn',\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'phead': None,\n",
       "  'pdeprel': None},\n",
       " {'id': 12,\n",
       "  'form': '技术',\n",
       "  'cpos': None,\n",
       "  'pos': None,\n",
       "  'head': 3,\n",
       "  'deprel': 'dobj',\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'phead': None,\n",
       "  'pdeprel': None},\n",
       " {'id': 13,\n",
       "  'form': '。',\n",
       "  'cpos': None,\n",
       "  'pos': None,\n",
       "  'head': 3,\n",
       "  'deprel': 'punct',\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'phead': None,\n",
       "  'pdeprel': None}]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tree"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Gn_RQa_Z6K20"
   },
   "source": [
    "打印时为CoNLL格式："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "26P1LGzv6K20",
    "outputId": "c78ffdb0-3cd7-492d-f55e-0d50120faffb"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2021年\t_\t_\t_\t_\t3\ttmod\t_\t_\n",
      "2\tHanLPv2.1\t_\t_\t_\t_\t3\tnsubj\t_\t_\n",
      "3\t带来\t_\t_\t_\t_\t0\troot\t_\t_\n",
      "4\t次\t_\t_\t_\t_\t5\tdet\t_\t_\n",
      "5\t世代\t_\t_\t_\t_\t7\tdep\t_\t_\n",
      "6\t最\t_\t_\t_\t_\t7\tadvmod\t_\t_\n",
      "7\t先进\t_\t_\t_\t_\t12\trcmod\t_\t_\n",
      "8\t的\t_\t_\t_\t_\t7\tcpm\t_\t_\n",
      "9\t多\t_\t_\t_\t_\t10\tnummod\t_\t_\n",
      "10\t语种\t_\t_\t_\t_\t12\tnn\t_\t_\n",
      "11\tNLP\t_\t_\t_\t_\t12\tnn\t_\t_\n",
      "12\t技术\t_\t_\t_\t_\t3\tdobj\t_\t_\n",
      "13\t。\t_\t_\t_\t_\t3\tpunct\t_\t_\n"
     ]
    }
   ],
   "source": [
    "print(tree)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如果不需要CoNLL格式的话，也许`conll=False`时的输出更加简洁："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(3, 'tmod'),\n",
       " (3, 'nsubj'),\n",
       " (0, 'root'),\n",
       " (5, 'det'),\n",
       " (7, 'dep'),\n",
       " (7, 'advmod'),\n",
       " (12, 'rcmod'),\n",
       " (7, 'cpm'),\n",
       " (10, 'nummod'),\n",
       " (12, 'nn'),\n",
       " (12, 'nn'),\n",
       " (3, 'dobj'),\n",
       " (3, 'punct')]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dep([\"2021年\", \"HanLPv2.1\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"], conll=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 可视化\n",
    "你可以构造一个`Document`实现漂亮的可视化："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
     {
      "data": {
       "text/html": [
        "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Dep&nbsp;Tree&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌──►&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│┌─►&nbsp;<br>┌┬───────┴┴──&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►└──&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;┌─►└──┼──&nbsp;<br>││&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└─►&nbsp;<br>││&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;│&nbsp;&nbsp;┌─►└──&nbsp;<br>││&nbsp;&nbsp;│&nbsp;&nbsp;│&nbsp;&nbsp;┌─►&nbsp;<br>│└─►└──┴──┴──&nbsp;<br>└───────────►&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Token&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Relati<br>──────<br>tmod&nbsp;&nbsp;<br>nsubj&nbsp;<br>root&nbsp;&nbsp;<br>det&nbsp;&nbsp;&nbsp;<br>dep&nbsp;&nbsp;&nbsp;<br>advmod<br>rcmod&nbsp;<br>cpm&nbsp;&nbsp;&nbsp;<br>nummod<br>nn&nbsp;&nbsp;&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;<br>dobj&nbsp;&nbsp;<br>punct&nbsp;</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from hanlp_common.document import Document\n",
    "doc = Document(\n",
    "    tok=[\"2021年\", \"HanLPv2.1\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n",
    "    dep=[(3, 'tmod'), (3, 'nsubj'), (0, 'root'), (5, 'det'), (7, 'dep'), (7, 'advmod'), (12, 'rcmod'), (7, 'cpm'), (10, 'nummod'), (12, 'nn'), (12, 'nn'), (3, 'dobj'), (3, 'punct')]\n",
    ")\n",
    "doc.pretty_print()"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "dep_stl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/extractive_summarization_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/extractive_summarization_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fextractive_summarization_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4M7ka0K5OMWU",
    "outputId": "d74f0749-0587-454a-d7c9-7418d45ce534"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "BMW528wGNulM"
   },
   "source": [
    "#### 申请密钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 抽取式自动摘要\n",
    "抽取式自动摘要的目标是从文章中筛选出一些作为摘要的中心句子：既要紧扣要点，又要避免赘语。\n",
    "### 中文\n",
    "抽取式自动摘要任务的输入为一段文本和所需的摘要句子数量的最大值`topk`："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "936d439a-e1ff-4308-d2aa-775955558594"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'据DigiTimes报道，在上海疫情趋缓，防疫管控开始放松后，苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。': 0.9999685883522034,\n",
       " '仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货，由于苹果的供应问题，他们的发货时间被大大推迟了。': 0.5798477530479431,\n",
       " '尽管MacBook Pro的生产逐渐恢复，但供应问题预计依然影响2022年第三季度的产品销售。': 0.5435440540313721}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = '''\n",
    "据DigiTimes报道，在上海疫情趋缓，防疫管控开始放松后，苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。\n",
    "据供应链消息人士称，生产厂的订单拉动情况正在慢慢转强，这会提高MacBook Pro机型的供应量，并缩短苹果客户在过去几周所经历的延长交货时间。\n",
    "仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货，由于苹果的供应问题，他们的发货时间被大大推迟了。\n",
    "据分析师郭明錤表示，广达是高端MacBook Pro的唯一供应商，自防疫封控依赖，MacBook Pro大部分型号交货时间增加了三到五周，\n",
    "一些高端定制型号的MacBook Pro配置要到6月底到7月初才能交货。\n",
    "尽管MacBook Pro的生产逐渐恢复，但供应问题预计依然影响2022年第三季度的产品销售。\n",
    "苹果上周表示，防疫措施和元部件短缺将继续使其难以生产足够的产品来满足消费者的强劲需求，这最终将影响苹果6月份的收入。\n",
    "'''\n",
    "HanLP.extractive_summarization(text, topk=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jj1Jk-2sPHYx"
   },
   "source": [
    "返回值为最多`topk`个摘要句子以及相应的权重，权重取值区间为$[0, 1]$。由于Trigram Blocking技巧，实际返回的摘要句数量可能小于`topk`。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 可视化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<span style=\"background-color:rgba(255, 255, 0, 0.9999685883522034);\">据DigiTimes报道，在上海疫情趋缓，防疫管控开始放松后，苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。</span>\n",
       "据供应链消息人士称，生产厂的订单拉动情况正在慢慢转强，这会提高MacBook Pro机型的供应量，并缩短苹果客户在过去几周所经历的延长交货时间。\n",
       "<span style=\"background-color:rgba(255, 255, 0, 0.5798477530479431);\">仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货，由于苹果的供应问题，他们的发货时间被大大推迟了。</span>\n",
       "据分析师郭明錤表示，广达是高端MacBook Pro的唯一供应商，自防疫封控依赖，MacBook Pro大部分型号交货时间增加了三到五周，\n",
       "一些高端定制型号的MacBook Pro配置要到6月底到7月初才能交货。\n",
       "<span style=\"background-color:rgba(255, 255, 0, 0.5435440540313721);\">尽管MacBook Pro的生产逐渐恢复，但供应问题预计依然影响2022年第三季度的产品销售。</span>\n",
       "<span style=\"background-color:rgba(255, 255, 0, 0.17781692743301392);\">苹果上周表示，防疫措施和元部件短缺将继续使其难以生产足够的产品来满足消费者的强劲需求，这最终将影响苹果6月份的收入。</span>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "def highlight(text, scores):\n",
    "    for k, v in scores.items():\n",
    "        text = text.replace(k, f'<span style=\"background-color:rgba(255, 255, 0, {v});\">{k}</span>')\n",
    "    from IPython.display import display, HTML\n",
    "    display(HTML(text))\n",
    "\n",
    "scores = HanLP.extractive_summarization(text, topk=100)\n",
    "highlight(text, scores)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 繁体中文\n",
    "HanLP的抽取式自动摘要接口支持繁体中文："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'華爾街日報周二（3日）報導，根據知情人透露，日前已宣布將以440億美元買下推特（Twitter）並下市的馬斯克，曾經跟一些潛在投資人說，他可以在短短幾年後，再將這家社群媒體公司重新上市。': 0.9999818205833435,\n",
       " '消息來源說，特斯拉創辦人兼執行長馬斯克表示，他計劃在買下推特後最短三年內，就展開推特的首次公開發行股票。': 0.503434419631958,\n",
       " '根據之前華爾街日報的報導，馬斯克為購買推特籌現金時，與私募股權公司等投資人討論出資事宜，Apollo Global Management有興趣參與。': 0.2688594460487366}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = '''\n",
    "華爾街日報周二（3日）報導，根據知情人透露，日前已宣布將以440億美元買下推特（Twitter）並下市的馬斯克，曾經跟一些潛在投資人說，他可以在短短幾年後，再將這家社群媒體公司重新上市。\n",
    "消息來源說，特斯拉創辦人兼執行長馬斯克表示，他計劃在買下推特後最短三年內，就展開推特的首次公開發行股票。\n",
    "馬斯克買推特的交易案預期在今年稍後走完程序，包括獲得股東同意以及監管機關核准等步驟。\n",
    "根據之前華爾街日報的報導，馬斯克為購買推特籌現金時，與私募股權公司等投資人討論出資事宜，Apollo Global Management有興趣參與。\n",
    "私募股權公司通常都先買下公司將之私有化，把公司移出眾人注目的焦點之外以後，整頓公司，接著再把公司上市，時間常是五年左右。\n",
    "華爾街日報指出，馬斯克暗示他對推特有類似的規劃的話，有助說服潛在投資人，他會很快行動，改善推特的營運和獲利。\n",
    "'''\n",
    "scores = HanLP.extractive_summarization(text)\n",
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<span style=\"background-color:rgba(255, 255, 0, 0.9999818205833435);\">華爾街日報周二（3日）報導，根據知情人透露，日前已宣布將以440億美元買下推特（Twitter）並下市的馬斯克，曾經跟一些潛在投資人說，他可以在短短幾年後，再將這家社群媒體公司重新上市。</span>\n",
       "<span style=\"background-color:rgba(255, 255, 0, 0.503434419631958);\">消息來源說，特斯拉創辦人兼執行長馬斯克表示，他計劃在買下推特後最短三年內，就展開推特的首次公開發行股票。</span>\n",
       "馬斯克買推特的交易案預期在今年稍後走完程序，包括獲得股東同意以及監管機關核准等步驟。\n",
       "<span style=\"background-color:rgba(255, 255, 0, 0.2688594460487366);\">根據之前華爾街日報的報導，馬斯克為購買推特籌現金時，與私募股權公司等投資人討論出資事宜，Apollo Global Management有興趣參與。</span>\n",
       "私募股權公司通常都先買下公司將之私有化，把公司移出眾人注目的焦點之外以後，整頓公司，接著再把公司上市，時間常是五年左右。\n",
       "華爾街日報指出，馬斯克暗示他對推特有類似的規劃的話，有助說服潛在投資人，他會很快行動，改善推特的營運和獲利。\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "highlight(text, scores)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 英文\n",
    "按照HanLP一贯的多语种设计，任何语言都支持。由于服务器GPU资源限制，目前英文接口暂未上线。如果你有相应需求，欢迎前往论坛发起请愿。"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "extractive_summarization_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/gec_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/gec_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fgec_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "#### 申请密钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 语法纠错\n",
    "输入短文本，执行语法纠错："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['每个青年都应当有远大的抱负。', '有的同学对语言很有兴趣。', '我市本地居民约占全市人口的70%。']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.grammatical_error_correction(['每个青年都应当有远大的报复。', '有的同学对语言很兴趣。', '我市本地居民约占全市人口的70%多。'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回值是每段短文本的修改结果列表。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 测试版\n",
    "当前版本为测试版，暂时仅支持拼写、标点和简单的语法错误，HanLP的线上模型和语料库仍然在迭代发展中。欢迎广大用户将测试版的问题反馈到[论坛](https://bbs.hankcs.com/c/text-generation/gec/30)，我们将在下一个版本中，将HanLP的文本纠错能力提升到高考语文水平。"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "gec_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/keyphrase_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/keyphrase_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fkeyphrase_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4M7ka0K5OMWU",
    "outputId": "d74f0749-0587-454a-d7c9-7418d45ce534"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "BMW528wGNulM"
   },
   "source": [
    "#### 申请密钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 关键词提取\n",
    "关键词（短语）提取的目标是提取文本中最具有代表性的关键词以及短语。\n",
    "### 中文\n",
    "关键词提取任务的输入为一段文本和所需的关键词数量`topk`："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "936d439a-e1ff-4308-d2aa-775955558594"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'自然语言处理': 0.800000011920929,\n",
       " 'HanLP的全部性能': 0.5256577134132385,\n",
       " '一门博大精深的学科': 0.42154020071029663}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.keyphrase_extraction('自然语言处理是一门博大精深的学科，掌握理论才能发挥出HanLP的全部性能。 '\n",
    "                           '《自然语言处理入门》是一本配套HanLP的NLP入门书，助你零起点上手自然语言处理。', topk=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jj1Jk-2sPHYx"
   },
   "source": [
    "返回值为`topk`个关键词以及相应的权重，权重取值区间为$[0, 1]$。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "关键词提取并不仅限于短文本，长文章也一样支持："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'新冠病毒核酸阳性感染': 0.888239324092865,\n",
       " '确诊病例': 0.8868124485015869,\n",
       " '本土无症状感染者': 0.8557102680206299,\n",
       " '属地社区（村屯）': 0.8164600133895874,\n",
       " '疫情防控工作': 0.7749382853507996,\n",
       " '我市疫情防控要求': 0.7502512335777283,\n",
       " '症状': 0.669366180896759,\n",
       " '我市疫情形势': 0.6673010587692261,\n",
       " '感染': 0.6663177013397217,\n",
       " '本土确诊病例': 0.6464788317680359}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doc = '''\n",
    "4月15日0-24时，长春市新增本土确诊病例157例（含57例无症状感染者转为确诊病例），新增本土无症状感染者407例。\n",
    "以上人员均为隔离管控期间筛查新冠病毒核酸阳性感染者。\n",
    "当前我市疫情形势严峻，为做好全市疫情防控工作，尽快恢复正常社会秩序和经济社会发展，长春市新冠肺炎疫情防控工作领导小组办公室提醒广大市民，\n",
    "请严格遵守我市疫情防控要求，配合各部门落实好防控措施，进一步提高防范意识，坚持规范戴口罩、勤洗手、常通风、保持社交距离、不聚餐、不聚集，\n",
    "减少疾病感染风险。一旦出现发热、干咳、乏力、咽痛、嗅味觉减退或丧失等不适症状，应及时向属地社区（村屯）或疾控机构报告。\n",
    "'''\n",
    "HanLP.keyphrase_extraction(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 可视化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "4月15日0-24时，长春市新增本土<span style=\"background-color:rgba(255, 255, 0, 0.8868124485015869);\">确诊病例</span>157例（含57例无<span style=\"background-color:rgba(255, 255, 0, 0.669366180896759);\">症状</span><span style=\"background-color:rgba(255, 255, 0, 0.6663177013397217);\">感染</span>者转为<span style=\"background-color:rgba(255, 255, 0, 0.8868124485015869);\">确诊病例</span>），新增<span style=\"background-color:rgba(255, 255, 0, 0.8557102680206299);\">本土无<span style=\"background-color:rgba(255, 255, 0, 0.669366180896759);\">症状</span><span style=\"background-color:rgba(255, 255, 0, 0.6663177013397217);\">感染</span>者</span>407例。\n",
       "以上人员均为隔离管控期间筛查<span style=\"background-color:rgba(255, 255, 0, 0.888239324092865);\">新冠病毒核酸阳性<span style=\"background-color:rgba(255, 255, 0, 0.6663177013397217);\">感染</span></span>者。\n",
       "当前<span style=\"background-color:rgba(255, 255, 0, 0.6673010587692261);\">我市疫情形势</span>严峻，为做好全市<span style=\"background-color:rgba(255, 255, 0, 0.7749382853507996);\">疫情防控工作</span>，尽快恢复正常社会秩序和经济社会发展，长春市新冠肺炎<span style=\"background-color:rgba(255, 255, 0, 0.7749382853507996);\">疫情防控工作</span>领导小组办公室提醒广大市民，\n",
       "请严格遵守<span style=\"background-color:rgba(255, 255, 0, 0.7502512335777283);\">我市疫情防控要求</span>，配合各部门落实好防控措施，进一步提高防范意识，坚持规范戴口罩、勤洗手、常通风、保持社交距离、不聚餐、不聚集，\n",
       "减少疾病<span style=\"background-color:rgba(255, 255, 0, 0.6663177013397217);\">感染</span>风险。一旦出现发热、干咳、乏力、咽痛、嗅味觉减退或丧失等不适<span style=\"background-color:rgba(255, 255, 0, 0.669366180896759);\">症状</span>，应及时向<span style=\"background-color:rgba(255, 255, 0, 0.8164600133895874);\">属地社区（村屯）</span>或疾控机构报告。\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "def highlight(text, scores):\n",
    "    for k, v in scores.items():\n",
    "        text = text.replace(k, f'<span style=\"background-color:rgba(255, 255, 0, {v});\">{k}</span>')\n",
    "    from IPython.display import display, HTML\n",
    "    display(HTML(text))\n",
    "\n",
    "scores = HanLP.keyphrase_extraction(doc)\n",
    "highlight(doc, scores)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 英文\n",
    "按照HanLP一贯的多语种设计，任何语言都支持。由于服务器GPU资源限制，目前英文接口暂未上线。如果你有相应需求，欢迎前往论坛发起请愿。"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "keyphrase_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/lid_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/lid_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Flid_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "nf9TgeCTC0OT"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jaW4eu6kC0OU",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_xI_bLAaC0OU"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "IYwV-UkNNzFp",
    "outputId": "54065443-9b0a-444c-f6c0-c701bc86400b",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1Uf_u7ddMhUt",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "#### 申请密钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 语种识别\n",
    "语种识别任务的输入为一个或多个文档："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "BqEmDMGGOtk3"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'en'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.language_identification('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "SwaPn1hjC0OW"
   },
   "source": [
    "返回对象为[ISO 639-1编码](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)。HanLP支持返回语种对应的概率（置信度）："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "egpWwHKxC0OX",
    "outputId": "f7c77687-dd75-4fa2-dbd2-be6bda8a3fff"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ja', 0.9976244568824768]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.language_identification('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "kq_j5TLFC0OX"
   },
   "source": [
    "HanLP也支持返回概率最高的`topk`个语种："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "isJhzYyIC0OX",
    "outputId": "683c8489-dffc-426e-f95b-e91dfb373260"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['zh', 'ja']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.language_identification('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "该功能对于混合了多个语种的文档而言特别实用："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'zh': 0.3952908217906952,\n",
       " 'en': 0.37189167737960815,\n",
       " 'ja': 0.056213412433862686}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = '''\n",
    "2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。\n",
    "In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.\n",
    "'''\n",
    "\n",
    "HanLP.language_identification(text, topk=3, prob=True)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "lid_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/lid_stl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/lid_stl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Flid_stl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "nf9TgeCTC0OT"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jaW4eu6kC0OU",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "!pip install hanlp[fasttext] -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_xI_bLAaC0OU"
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "IYwV-UkNNzFp",
    "outputId": "54065443-9b0a-444c-f6c0-c701bc86400b",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CHNSENTICORP_BERT_BASE_ZH': 'https://file.hankcs.com/hanlp/classification/chnsenticorp_bert_base_20211228_163210.zip',\n",
       " 'SST2_ALBERT_BASE_EN': 'https://file.hankcs.com/hanlp/classification/sst2_albert_base_20211228_164917.zip',\n",
       " 'LID_176_FASTTEXT_BASE': 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin',\n",
       " 'LID_176_FASTTEXT_SMALL': 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.classifiers.ALL # 任务见第一个字段"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1Uf_u7ddMhUt",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "pp-1KqEOOJ4t",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
     ]
    }
   ],
   "source": [
    "lid = hanlp.load('LID_176_FASTTEXT_BASE')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 语种识别\n",
    "语种识别任务的输入为一个或多个文档："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "BqEmDMGGOtk3"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'en'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lid('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "SwaPn1hjC0OW"
   },
   "source": [
    "返回对象为[ISO 639-1编码](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)。HanLP支持返回语种对应的概率（置信度）："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "egpWwHKxC0OX",
    "outputId": "f7c77687-dd75-4fa2-dbd2-be6bda8a3fff"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('ja', 0.9976244568824768)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lid('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "kq_j5TLFC0OX"
   },
   "source": [
    "HanLP也支持返回概率最高的`topk`个语种："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "isJhzYyIC0OX",
    "outputId": "683c8489-dffc-426e-f95b-e91dfb373260"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['zh', 'ja']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "该功能对于混合了多个语种的文档而言特别实用："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'zh': 0.3952908217906952,\n",
       " 'en': 0.37189167737960815,\n",
       " 'ja': 0.056213412433862686}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = '''\n",
    "2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。\n",
    "In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.\n",
    "'''\n",
    "\n",
    "lid(text, topk=3, prob=True)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "lid_stl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/ner_mtl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/ner_mtl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fner_mtl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1Uf_u7ddMhUt",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "pp-1KqEOOJ4t",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "0tmKBu7sNAXX",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "EmZDmLn9aGxG",
    "outputId": "38469cbe-d56c-4648-b103-b67e6d22aeff",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n",
       " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n",
       " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.mtl.ALL # MTL多任务，具体任务见模型名称，语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "w0lm87NUsMwW"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存。自然语言处理分为许多任务，分词只是最初级的一个。与其每个任务单独创建一个模型，不如利用HanLP的联合模型一次性完成多个任务："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "id": "6Evnxsa0sMwW",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "bPUHdNJ-sMwW"
   },
   "source": [
    "## 命名实体识别"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wxctCigrTKu-"
   },
   "source": [
    "同时执行所有标准的命名实体识别："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Zo08uquCTFSk",
    "outputId": "21be671b-ead0-43c9-cc3a-32c305d8be29"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok/fine\": [\n",
      "    [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n",
      "    [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n",
      "  ],\n",
      "  \"ner/msra\": [\n",
      "    [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"WWW\", 1, 2]],\n",
      "    [[\"北京\", \"LOCATION\", 2, 3], [\"立方庭\", \"LOCATION\", 3, 4], [\"自然语义科技公司\", \"ORGANIZATION\", 5, 9]]\n",
      "  ],\n",
      "  \"ner/pku\": [\n",
      "    [],\n",
      "    [[\"北京立方庭\", \"ns\", 2, 4], [\"自然语义科技公司\", \"nt\", 5, 9]]\n",
      "  ],\n",
      "  \"ner/ontonotes\": [\n",
      "    [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"ORG\", 1, 2]],\n",
      "    [[\"北京立方庭\", \"FAC\", 2, 4], [\"自然语义科技公司\", \"ORG\", 5, 9]]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'], tasks='ner*'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "每个四元组表示`[命名实体, 类型标签, 起始下标, 终止下标]`，下标指的是命名实体在单词数组中的下标，单词数组默认为第一个以`tok`开头的数组。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "cqEWnj_7p2Lf"
   },
   "source": [
    "任务越少，速度越快。如指定仅执行命名实体识别，默认MSRA标准："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 572
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "33790ca9-7013-456f-c1cb-e5ddce90a457"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token    \tNER Type        \n",
      "─────────\t────────────────\n",
      "2021年    \t───►DATE        \n",
      "HanLPv2.1\t───►WWW         \n",
      "为        \t                \n",
      "生产       \t                \n",
      "环境       \t                \n",
      "带来       \t                \n",
      "次世代      \t───►DATE        \n",
      "最        \t                \n",
      "先进       \t                \n",
      "的        \t                \n",
      "多        \t                \n",
      "语种       \t                \n",
      "NLP      \t                \n",
      "技术       \t                \n",
      "。        \t                \n",
      "阿婆主      \t                \n",
      "来到       \t                \n",
      "北京       \t◄─┐             \n",
      "立方庭      \t◄─┴►ORGANIZATION\n",
      "参观       \t                \n",
      "自然       \t◄─┐             \n",
      "语义       \t  │             \n",
      "科技       \t  ├►ORGANIZATION\n",
      "公司       \t◄─┘             \n",
      "。        \t                \n"
     ]
    }
   ],
   "source": [
    "HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jj1Jk-2sPHYx"
   },
   "source": [
    "执行OntoNotes命名实体识别："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 572
    },
    "id": "1goEC7znPNkI",
    "outputId": "2a97331c-a5fb-4d3c-ccf2-ce2186616c57",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token    \tNER Type\n",
      "─────────\t────────\n",
      "2021年    \t───►DATE\n",
      "HanLPv2.1\t───►ORG \n",
      "为        \t        \n",
      "生产       \t        \n",
      "环境       \t        \n",
      "带来       \t        \n",
      "次世代      \t        \n",
      "最        \t        \n",
      "先进       \t        \n",
      "的        \t        \n",
      "多        \t        \n",
      "语种       \t        \n",
      "NLP      \t        \n",
      "技术       \t        \n",
      "。        \t        \n",
      "阿婆主      \t        \n",
      "来到       \t        \n",
      "北京       \t◄─┐     \n",
      "立方庭      \t◄─┴►ORG \n",
      "参观       \t        \n",
      "自然       \t◄─┐     \n",
      "语义       \t  │     \n",
      "科技       \t  ├►ORG \n",
      "公司       \t◄─┘     \n",
      "。        \t        \n"
     ]
    }
   ],
   "source": [
    "HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner/ontonotes').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 注意\n",
    "Native API的输入单位限定为句子，需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外，RESTful和native两种API的语义设计完全一致，用户可以无缝互换。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "P7CNTDBRsiYa"
   },
   "source": [
    "## 自定义词典"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ZXtRTXlBsmtw"
   },
   "source": [
    "自定义词典是NER任务的成员变量，要操作自定义词典，先获取一个NER任务。以MSRA为例："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "id": "QgY22h0AszsA"
   },
   "outputs": [],
   "source": [
    "ner = HanLP['ner/msra']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_6fPzuyps98H"
   },
   "source": [
    "### 白名单词典\n",
    "白名单词典中的词语会尽量被输出。当然，HanLP以统计为主，词典的优先级很低。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 321
    },
    "id": "plNDyWhws5qg",
    "outputId": "7120d400-022c-42e9-fca9-febe3745d2c9"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token\tNER Type   \n",
      "─────\t───────────\n",
      "2021年\t───►DATE   \n",
      "测试   \t           \n",
      "高血压  \t           \n",
      "是    \t           \n",
      "138  \t───►INTEGER\n",
      "，    \t           \n",
      "时间   \t           \n",
      "是    \t           \n",
      "午饭   \t◄─┐        \n",
      "后    \t◄─┴►TIME   \n",
      "2点45 \t───►TIME   \n",
      "，    \t           \n",
      "低血压  \t           \n",
      "是    \t           \n",
      "44   \t───►INTEGER\n"
     ]
    }
   ],
   "source": [
    "ner.dict_whitelist = {'午饭后': 'TIME'}\n",
    "doc = HanLP('2021年测试高血压是138，时间是午饭后2点45，低血压是44', tasks='ner/msra')\n",
    "doc.pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "aR_8TICmtw_E"
   },
   "source": [
    "### 强制词典\n",
    "如果你读过[《自然语言处理入门》](http://nlp.hankcs.com/book.php)，你就会理解BMESO标注集，于是你可以直接干预统计模型预测的标签，拿到最高优先级的权限。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 268
    },
    "id": "sWPljj3stsEA",
    "outputId": "99c4c281-a5b6-46bb-dffd-c1722fee7aee"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "To\tNER Type    \n",
      "──\t────────────\n",
      "他 \t            \n",
      "在 \t            \n",
      "浙江\t───►LOCATION\n",
      "金华\t───►LOCATION\n",
      "出生\t            \n",
      "， \t            \n",
      "他 \t            \n",
      "的 \t            \n",
      "名字\t            \n",
      "叫 \t            \n",
      "金华\t───►PERSON  \n",
      "。 \t            \n"
     ]
    }
   ],
   "source": [
    "ner.dict_tags = {('名字', '叫', '金华'): ('O', 'O', 'S-PERSON')}\n",
    "HanLP('他在浙江金华出生，他的名字叫金华。', tasks='ner/msra').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "fkTC0GFxtinZ"
   },
   "source": [
    "### 黑名单词典\n",
    "黑名单中的词语绝对不会被当做命名实体。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 268
    },
    "id": "bIJpgdGauLJK",
    "outputId": "e74ec7ba-00fd-4958-d772-a1d1c40d1033"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "To\tNER Type    \n",
      "──\t────────────\n",
      "他 \t            \n",
      "在 \t            \n",
      "浙江\t───►LOCATION\n",
      "金华\t            \n",
      "出生\t            \n",
      "， \t            \n",
      "他 \t            \n",
      "的 \t            \n",
      "名字\t            \n",
      "叫 \t            \n",
      "金华\t            \n",
      "。 \t            \n"
     ]
    }
   ],
   "source": [
    "ner.dict_blacklist = {'金华'}\n",
    "HanLP('他在浙江金华出生，他的名字叫金华。', tasks='ner/msra').pretty_print()"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "ner_mtl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/ner_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/ner_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fner_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "#### 申请密钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 命名实体识别"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wxctCigrTKu-"
   },
   "source": [
    "同时执行所有标准的命名实体识别："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Zo08uquCTFSk",
    "outputId": "21be671b-ead0-43c9-cc3a-32c305d8be29"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok/fine\": [\n",
      "    [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n",
      "    [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n",
      "  ],\n",
      "  \"ner/msra\": [\n",
      "    [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"ORGANIZATION\", 1, 2]],\n",
      "    [[\"北京立方庭\", \"LOCATION\", 2, 4], [\"自然语义科技公司\", \"ORGANIZATION\", 5, 9]]\n",
      "  ],\n",
      "  \"ner/pku\": [\n",
      "    [],\n",
      "    [[\"北京\", \"ns\", 2, 3], [\"立方庭\", \"ns\", 3, 4], [\"自然语义科技公司\", \"nt\", 5, 9]]\n",
      "  ],\n",
      "  \"ner/ontonotes\": [\n",
      "    [[\"2021年\", \"DATE\", 0, 1], [\"次世代\", \"DATE\", 6, 8]],\n",
      "    [[\"北京\", \"FAC\", 2, 3], [\"立方庭\", \"LOC\", 3, 4], [\"自然语义科技公司\", \"ORG\", 5, 9]]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner*'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "每个四元组表示`[命名实体, 类型标签, 起始下标, 终止下标]`，下标指的是命名实体在单词数组中的下标，单词数组默认为第一个以`tok`开头的数组。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "cqEWnj_7p2Lf"
   },
   "source": [
    "任务越少，速度越快。如指定仅执行命名实体识别，默认MSRA标准："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 572
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "33790ca9-7013-456f-c1cb-e5ddce90a457"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token    \tNER Type        \n",
      "─────────\t────────────────\n",
      "2021年    \t───►DATE        \n",
      "HanLPv2.1\t───►ORGANIZATION\n",
      "为        \t                \n",
      "生产       \t                \n",
      "环境       \t                \n",
      "带来       \t                \n",
      "次        \t                \n",
      "世代       \t                \n",
      "最        \t                \n",
      "先进       \t                \n",
      "的        \t                \n",
      "多        \t                \n",
      "语种       \t                \n",
      "NLP      \t                \n",
      "技术       \t                \n",
      "。        \t                \n",
      "\n",
      "Tok\tNER Type        \n",
      "───\t────────────────\n",
      "阿婆主\t                \n",
      "来到 \t                \n",
      "北京 \t◄─┐             \n",
      "立方庭\t◄─┴►LOCATION    \n",
      "参观 \t                \n",
      "自然 \t◄─┐             \n",
      "语义 \t  │             \n",
      "科技 \t  ├►ORGANIZATION\n",
      "公司 \t◄─┘             \n",
      "。  \t                \n"
     ]
    }
   ],
   "source": [
    "HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jj1Jk-2sPHYx"
   },
   "source": [
    "执行OntoNotes命名实体识别："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 572
    },
    "id": "1goEC7znPNkI",
    "outputId": "2a97331c-a5fb-4d3c-ccf2-ce2186616c57"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token    \tNER Type\n",
      "─────────\t────────\n",
      "2021年    \t───►DATE\n",
      "HanLPv2.1\t        \n",
      "为        \t        \n",
      "生产       \t        \n",
      "环境       \t        \n",
      "带来       \t        \n",
      "次        \t◄─┐     \n",
      "世代       \t◄─┴►DATE\n",
      "最        \t        \n",
      "先进       \t        \n",
      "的        \t        \n",
      "多        \t        \n",
      "语种       \t        \n",
      "NLP      \t        \n",
      "技术       \t        \n",
      "。        \t        \n",
      "\n",
      "Tok\tNER Typ\n",
      "───\t───────\n",
      "阿婆主\t       \n",
      "来到 \t       \n",
      "北京 \t───►FAC\n",
      "立方庭\t───►LOC\n",
      "参观 \t       \n",
      "自然 \t◄─┐    \n",
      "语义 \t  │    \n",
      "科技 \t  ├►ORG\n",
      "公司 \t◄─┘    \n",
      "。  \t       \n"
     ]
    }
   ],
   "source": [
    "HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner/ontonotes').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XOsWkOqQfzlr"
   },
   "source": [
    "为已分词的句子执行命名实体识别："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 161
    },
    "id": "bLZSTbv_f3OA",
    "outputId": "6a0e1e76-f581-4fd1-8a78-ef97d9429e87"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token   \tNER Type        \n",
      "────────\t────────────────\n",
      "阿婆主     \t                \n",
      "来到      \t                \n",
      "北京立方庭   \t───►LOCATION    \n",
      "参观      \t                \n",
      "自然语义科技公司\t───►ORGANIZATION\n",
      "。       \t                \n"
     ]
    }
   ],
   "source": [
    "HanLP(tokens=[[\"阿婆主\", \"来到\", \"北京立方庭\", \"参观\", \"自然语义科技公司\", \"。\"]], tasks='ner').pretty_print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "ner_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/ner_stl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/ner_stl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fner_stl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1Uf_u7ddMhUt",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "pp-1KqEOOJ4t",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "0tmKBu7sNAXX",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "EmZDmLn9aGxG",
    "outputId": "0d55f7a1-3a4c-4170-e60f-da7473208e3f",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'MSRA_NER_BERT_BASE_ZH': 'https://file.hankcs.com/hanlp/ner/ner_bert_base_msra_20211227_114712.zip',\n",
       " 'MSRA_NER_ALBERT_BASE_ZH': 'https://file.hankcs.com/hanlp/ner/msra_ner_albert_base_20211228_173323.zip',\n",
       " 'MSRA_NER_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/ner/msra_ner_electra_small_20210807_154832.zip',\n",
       " 'CONLL03_NER_BERT_BASE_CASED_EN': 'https://file.hankcs.com/hanlp/ner/ner_conll03_bert_base_cased_en_20211227_121443.zip'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.ner.ALL # 语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "VDT-qmLyvDST"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "Tzu5Qi-xvDST",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "ner = hanlp.load(hanlp.pretrained.ner.MSRA_NER_ELECTRA_SMALL_ZH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 命名实体识别"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wxctCigrTKu-"
   },
   "source": [
    "命名实体识别任务的输入为已分词的句子："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Zo08uquCTFSk",
    "outputId": "864da076-7113-4685-e27a-1856e69bdd2a"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[('2021年', 'DATE', 0, 1)], [('北京', 'LOCATION', 2, 3), ('立方庭', 'LOCATION', 3, 4), ('自然语义科技公司', 'ORGANIZATION', 5, 9)]]\n"
     ]
    }
   ],
   "source": [
    "print(ner([[\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"], [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]], tasks='ner*'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "每个四元组表示`[命名实体, 类型标签, 起始下标, 终止下标]`，下标指的是命名实体在单词数组中的下标。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 自定义词典"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "自定义词典是NER任务的成员变量："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "None\n"
     ]
    }
   ],
   "source": [
    "print(ner.dict_whitelist)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 白名单词典\n",
    "白名单词典中的词语会尽量被输出。当然，HanLP以统计为主，词典的优先级很低。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('2021年', 'DATE', 0, 1),\n",
       " ('138', 'INTEGER', 4, 5),\n",
       " ('午饭后', 'TIME', 8, 10),\n",
       " ('2点45', 'TIME', 10, 11),\n",
       " ('44', 'INTEGER', 14, 15)]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ner.dict_whitelist = {'午饭后': 'TIME'}\n",
    "ner(['2021年', '测试', '高血压', '是', '138', '，', '时间', '是', '午饭', '后', '2点45', '，', '低血压', '是', '44'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 强制词典\n",
    "如果你读过[《自然语言处理入门》](http://nlp.hankcs.com/book.php)，你就会理解BMESO标注集，于是你可以直接干预统计模型预测的标签，拿到最高优先级的权限。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('浙江', 'LOCATION', 2, 3), ('金华', 'LOCATION', 3, 4), ('金华', 'PERSON', 10, 11)]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ner.dict_tags = {('名字', '叫', '金华'): ('O', 'O', 'S-PERSON')}\n",
    "ner(['他', '在', '浙江', '金华', '出生', '，', '他', '的', '名字', '叫', '金华', '。'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 黑名单词典\n",
    "黑名单中的词语绝对不会被当做命名实体。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('浙江', 'LOCATION', 2, 3)]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ner.dict_blacklist = {'金华'}\n",
    "ner(['他', '在', '浙江', '金华', '出生', '，', '他', '的', '名字', '叫', '金华', '。'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "ner_stl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/pos_mtl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "view-in-github"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/pos_mtl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fpos_mtl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4M7ka0K5OMWU",
    "outputId": "50ad002e-4363-46cd-8f5d-b6d6aad3e957"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n",
       " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n",
       " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.mtl.ALL # MTL多任务，具体任务见模型名称，语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "BMW528wGNulM"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存。自然语言处理分为许多任务，分词只是最初级的一个。与其每个任务单独创建一个模型，不如利用HanLP的联合模型一次性完成多个任务："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [],
   "source": [
    "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 词性标注\n",
    "任务越少，速度越快。如指定仅执行词性标注，默认CTB标准："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "5ad7fd22-651a-4403-d897-a9492eb15854"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">HanLP/NR&nbsp;为/P&nbsp;生产/NN&nbsp;环境/NN&nbsp;带来/VV&nbsp;次/JJ&nbsp;世代/NN&nbsp;最/AD&nbsp;先进/JJ&nbsp;的/DEG&nbsp;多语种/NN&nbsp;NLP/NR&nbsp;技术/NN&nbsp;。/PU</pre></div><br><div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">我/PN&nbsp;的/DEG&nbsp;希望/NN&nbsp;是/VC&nbsp;希望/VV&nbsp;张晚霞/NR&nbsp;的/DEG&nbsp;背影/NN&nbsp;被/LB&nbsp;晚霞/NN&nbsp;映红/VV&nbsp;。/PU</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP(['HanLP为生产环境带来次世代最先进的多语种NLP技术。', '我的希望是希望张晚霞的背影被晚霞映红。'], tasks='pos').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jj1Jk-2sPHYx"
   },
   "source": [
    "注意上面两个“希望”的词性各不相同，一个是名词另一个是动词。\n",
    "\n",
    "执行PKU词性标注："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "1goEC7znPNkI",
    "outputId": "586afd5d-db0d-41bd-f7de-411f37062a8c"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">HanLP/nx&nbsp;为/p&nbsp;生产/vn&nbsp;环境/n&nbsp;带来/v&nbsp;次/b&nbsp;世代/n&nbsp;最/d&nbsp;先进/a&nbsp;的/u&nbsp;多语种/n&nbsp;NLP/nx&nbsp;技术/n&nbsp;。/w</pre></div><br><div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">我/r&nbsp;的/u&nbsp;希望/n&nbsp;是/v&nbsp;希望/v&nbsp;张晚霞/nr&nbsp;的/u&nbsp;背影/n&nbsp;被/p&nbsp;晚霞/n&nbsp;映红/v&nbsp;。/w</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP(['HanLP为生产环境带来次世代最先进的多语种NLP技术。', '我的希望是希望张晚霞的背影被晚霞映红。'], tasks='pos/pku').pretty_print()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wxctCigrTKu-"
   },
   "source": [
    "同时执行所有标准的词性标注："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Zo08uquCTFSk",
    "outputId": "d2b3eb65-06e6-47a6-d954-04cae27d6c51"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok/fine\": [\n",
      "    [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
      "    [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
      "  ],\n",
      "  \"pos/ctb\": [\n",
      "    [\"NR\", \"P\", \"NN\", \"NN\", \"VV\", \"JJ\", \"NN\", \"AD\", \"JJ\", \"DEG\", \"NN\", \"NR\", \"NN\", \"PU\"],\n",
      "    [\"PN\", \"DEG\", \"NN\", \"VC\", \"VV\", \"NR\", \"DEG\", \"NN\", \"LB\", \"NN\", \"VV\", \"PU\"]\n",
      "  ],\n",
      "  \"pos/pku\": [\n",
      "    [\"nx\", \"p\", \"vn\", \"n\", \"v\", \"b\", \"n\", \"d\", \"a\", \"u\", \"n\", \"nx\", \"n\", \"w\"],\n",
      "    [\"r\", \"u\", \"n\", \"v\", \"v\", \"nr\", \"u\", \"n\", \"p\", \"n\", \"v\", \"w\"]\n",
      "  ],\n",
      "  \"pos/863\": [\n",
      "    [\"w\", \"p\", \"v\", \"n\", \"v\", \"a\", \"nt\", \"d\", \"a\", \"u\", \"n\", \"ws\", \"n\", \"w\"],\n",
      "    [\"r\", \"u\", \"n\", \"vl\", \"v\", \"nh\", \"u\", \"n\", \"p\", \"n\", \"v\", \"w\"]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(HanLP(['HanLP为生产环境带来次世代最先进的多语种NLP技术。', '我的希望是希望张晚霞的背影被晚霞映红。'], tasks='pos*'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "以`pos`开头的字段为词性，以`tok`开头的第一个数组为单词，两者按下标一一对应。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 注意\n",
    "Native API的输入单位限定为句子，需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外，RESTful和native两种API的语义设计完全一致，用户可以无缝互换。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "suUL042zPpLj"
   },
   "source": [
    "## 自定义词典\n",
    "自定义词典为词性标注任务的成员变量，要操作自定义词典，先获取一个词性标注任务，以CTB标准为例："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "AzYShIssP6kq",
    "outputId": "640cefa5-1d6d-464b-81d2-83c66e2081f2"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<hanlp.components.mtl.tasks.pos.TransformerTagging at 0x160950910>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pos = HanLP['pos/ctb']\n",
    "pos"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1q4MUpgVQNlu"
   },
   "source": [
    "自定义单个词性："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "id": "2zZkH9tRQOoi",
    "outputId": "ed0bb8fe-2e68-4c58-e11e-ff6a0cc69ae4"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">HanLP/state-of-the-art-tool&nbsp;为/P&nbsp;生产/NN&nbsp;环境/NN&nbsp;带来/VV&nbsp;次/JJ&nbsp;世代/NN&nbsp;最/AD&nbsp;先进/JJ&nbsp;的/DEG&nbsp;多语种/NN&nbsp;NLP/NR&nbsp;技术/NN&nbsp;。/PU</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "pos.dict_tags = {'HanLP': 'state-of-the-art-tool'}\n",
    "HanLP(\"HanLP为生产环境带来次世代最先进的多语种NLP技术。\", tasks='pos/ctb').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "F-9gAeIVQUFG"
   },
   "source": [
    "根据上下文自定义词性："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "id": "F8M8cyBrQduw",
    "outputId": "16ef7f82-50ff-478f-c3ea-8e768b0cea31"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">我/PN&nbsp;的/补语成分&nbsp;希望/名词&nbsp;是/VC&nbsp;希望/动词&nbsp;张晚霞/NR&nbsp;的/DEG&nbsp;背影/NN&nbsp;被/LB&nbsp;晚霞/NN&nbsp;映红/VV&nbsp;。/PU</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "pos.dict_tags = {('的', '希望'): ('补语成分', '名词'), '希望': '动词'}\n",
    "HanLP(\"我的希望是希望张晚霞的背影被晚霞映红。\", tasks='pos/ctb').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "需要算法基础才能理解，初学者可参考[《自然语言处理入门》](http://nlp.hankcs.com/book.php)。"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "include_colab_link": true,
   "name": "pos_mtl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/pos_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/pos_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fpos_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "#### 申请密钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 词性标注\n",
    "任务越少，速度越快。如指定仅执行词性标注，默认CTB标准："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; padding-bottom: 1rem;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap; line-height: 128%; padding: 0;\">HanLP/NR&nbsp;为/P&nbsp;生产/NN&nbsp;环境/NN&nbsp;带来/VV&nbsp;次世代/NN&nbsp;最/AD&nbsp;先进/JJ&nbsp;的/DEG&nbsp;多/CD&nbsp;语种/NN&nbsp;NLP/NN&nbsp;技术/NN&nbsp;。/PU</pre></div><br><div style=\"display: table; padding-bottom: 1rem;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap; line-height: 128%; padding: 0;\">我/PN&nbsp;的/DEG&nbsp;希望/NN&nbsp;是/VC&nbsp;希望/VV&nbsp;张晚霞/NR&nbsp;的/DEG&nbsp;背影/NN&nbsp;被/LB&nbsp;晚霞/NN&nbsp;映红/VV&nbsp;。/PU</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP('HanLP为生产环境带来次世代最先进的多语种NLP技术。我的希望是希望张晚霞的背影被晚霞映红。', tasks='pos').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jj1Jk-2sPHYx"
   },
   "source": [
    "注意上面两个“希望”的词性各不相同，一个是名词另一个是动词。\n",
    "\n",
    "### 执行PKU词性标注"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "1goEC7znPNkI",
    "outputId": "7a3fde55-7577-49eb-92c8-48146aaa89d3"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; padding-bottom: 1rem;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap; line-height: 128%; padding: 0;\">HanLP/nx&nbsp;为/p&nbsp;生产/vn&nbsp;环境/n&nbsp;带来/v&nbsp;次世代/n&nbsp;最/d&nbsp;先进/a&nbsp;的/u&nbsp;多/a&nbsp;语种/n&nbsp;NLP/nx&nbsp;技术/n&nbsp;。/w</pre></div><br><div style=\"display: table; padding-bottom: 1rem;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap; line-height: 128%; padding: 0;\">我/r&nbsp;的/u&nbsp;希望/n&nbsp;是/v&nbsp;希望/v&nbsp;张晚霞/nr&nbsp;的/u&nbsp;背影/n&nbsp;被/p&nbsp;晚霞/n&nbsp;映红/v&nbsp;。/w</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP('HanLP为生产环境带来次世代最先进的多语种NLP技术。我的希望是希望张晚霞的背影被晚霞映红。', tasks='pos/pku').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 执行粗颗粒度分词和PKU词性标注"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; padding-bottom: 1rem;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap; line-height: 128%; padding: 0;\">阿婆主/n&nbsp;来到/v&nbsp;北京立方庭/ns&nbsp;参观/v&nbsp;自然语义科技公司/n&nbsp;。/w</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks=['tok/coarse', 'pos/pku'], skip_tasks='tok/fine').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "举一反三，你可以指定其他pos标注集（ctb、863等）。用户有多聪明，HanLP就有多强大。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wxctCigrTKu-"
   },
   "source": [
    "### 同时执行所有标准的词性标注"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Zo08uquCTFSk",
    "outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok/fine\": [\n",
      "    [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n",
      "    [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
      "  ],\n",
      "  \"pos/ctb\": [\n",
      "    [\"NR\", \"P\", \"NN\", \"NN\", \"VV\", \"NN\", \"AD\", \"JJ\", \"DEG\", \"CD\", \"NN\", \"NN\", \"NN\", \"PU\"],\n",
      "    [\"PN\", \"DEG\", \"NN\", \"VC\", \"VV\", \"NR\", \"DEG\", \"NN\", \"LB\", \"NN\", \"VV\", \"PU\"]\n",
      "  ],\n",
      "  \"pos/pku\": [\n",
      "    [\"nx\", \"p\", \"vn\", \"n\", \"v\", \"n\", \"d\", \"a\", \"u\", \"a\", \"n\", \"nx\", \"n\", \"w\"],\n",
      "    [\"r\", \"u\", \"n\", \"v\", \"v\", \"nr\", \"u\", \"n\", \"p\", \"n\", \"v\", \"w\"]\n",
      "  ],\n",
      "  \"pos/863\": [\n",
      "    [\"w\", \"p\", \"v\", \"n\", \"v\", \"n\", \"d\", \"a\", \"u\", \"a\", \"n\", \"w\", \"n\", \"w\"],\n",
      "    [\"r\", \"u\", \"v\", \"vl\", \"v\", \"nh\", \"u\", \"n\", \"p\", \"n\", \"v\", \"w\"]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(HanLP('HanLP为生产环境带来次世代最先进的多语种NLP技术。我的希望是希望张晚霞的背影被晚霞映红。', tasks='pos*'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "以`pos`开头的字段为词性，以`tok`开头的第一个数组为单词，两者按下标一一对应。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XOsWkOqQfzlr"
   },
   "source": [
    "### 为已分词的句子执行词性标注"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "bLZSTbv_f3OA",
    "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; padding-bottom: 1rem;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap; line-height: 128%; padding: 0;\">HanLP/NR&nbsp;为/P&nbsp;生产环境/NN&nbsp;带来/VV&nbsp;次世代/NN&nbsp;最/AD&nbsp;先进/JJ&nbsp;的/DEG&nbsp;多语种/NN&nbsp;NLP/NN&nbsp;技术/NN&nbsp;。/PU</pre></div><br><div style=\"display: table; padding-bottom: 1rem;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap; line-height: 128%; padding: 0;\">我/PN&nbsp;的/DEG&nbsp;希望/NN&nbsp;是/VC&nbsp;希望/VV&nbsp;张晚霞/NR&nbsp;的/DEG&nbsp;背影/NN&nbsp;被/LB&nbsp;晚霞/NN&nbsp;映红/VV&nbsp;。/PU</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP(tokens=[\n",
    "    [\"HanLP\", \"为\", \"生产环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
    "    [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
    "  ], tasks='pos').pretty_print()"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "pos_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/pos_stl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/pos_stl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fpos_stl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4M7ka0K5OMWU",
    "outputId": "d74f0749-0587-454a-d7c9-7418d45ce534"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CTB5_POS_RNN': 'https://file.hankcs.com/hanlp/pos/ctb5_pos_rnn_20200113_235925.zip',\n",
       " 'CTB5_POS_RNN_FASTTEXT_ZH': 'https://file.hankcs.com/hanlp/pos/ctb5_pos_rnn_fasttext_20191230_202639.zip',\n",
       " 'CTB9_POS_ALBERT_BASE': 'https://file.hankcs.com/hanlp/pos/ctb9_albert_base_20211228_163935.zip',\n",
       " 'CTB9_POS_ELECTRA_SMALL_TF': 'https://file.hankcs.com/hanlp/pos/pos_ctb_electra_small_20211227_121341.zip',\n",
       " 'CTB9_POS_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/pos/pos_ctb_electra_small_20220215_111944.zip',\n",
       " 'CTB9_POS_RADICAL_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/pos/pos_ctb_radical_electra_small_20220215_111932.zip',\n",
       " 'C863_POS_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/pos/pos_863_electra_small_20220217_101958.zip',\n",
       " 'PKU_POS_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/pos/pos_pku_electra_small_20220217_142436.zip',\n",
       " 'PKU98_POS_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/pos/pos_pku_electra_small_20210808_125158.zip',\n",
       " 'PTB_POS_RNN_FASTTEXT_EN': 'https://file.hankcs.com/hanlp/pos/ptb_pos_rnn_fasttext_20200103_145337.zip'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.pos.ALL # 语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "BMW528wGNulM"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "0tmKBu7sNAXX",
    "outputId": "df2de87b-27f5-4c72-8eb2-25ceefdd8270"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading https://file.hankcs.com/hanlp/pos/ctb9_pos_electra_small_20220118_164341.zip to /root/.hanlp/pos/ctb9_pos_electra_small_20220118_164341.zip\n",
      "100%  43.6 MiB  21.2 MiB/s ETA:  0 s [=========================================]\n",
      "Decompressing /root/.hanlp/pos/ctb9_pos_electra_small_20220118_164341.zip to /root/.hanlp/pos\n",
      "Downloading https://file.hankcs.com/hanlp/transformers/electra_zh_small_20210706_125427.zip to /root/.hanlp/transformers/electra_zh_small_20210706_125427.zip\n",
      "100%  41.2 KiB  41.2 KiB/s ETA:  0 s [=========================================]\n",
      "Decompressing /root/.hanlp/transformers/electra_zh_small_20210706_125427.zip to /root/.hanlp/transformers\n"
     ]
    }
   ],
   "source": [
    "pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 词性标注\n",
    "词性标注任务的输入为已分词的一个或多个句子："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "936d439a-e1ff-4308-d2aa-775955558594"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['PN', 'DEG', 'NN', 'VC', 'VV', 'NR', 'DEG', 'NN', 'LB', 'NR', 'VV', 'PU']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pos([\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jj1Jk-2sPHYx"
   },
   "source": [
    "注意上面两个“希望”的词性各不相同，一个是名词另一个是动词。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "suUL042zPpLj"
   },
   "source": [
    "## 自定义词典\n",
    "自定义词典为词性标注任务的成员变量，以CTB标准为例："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "AzYShIssP6kq",
    "outputId": "99b2607b-b618-4876-bbea-9f8c24859a85"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "None\n"
     ]
    }
   ],
   "source": [
    "print(pos.dict_tags)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1q4MUpgVQNlu"
   },
   "source": [
    "自定义单个词性："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "2zZkH9tRQOoi",
    "outputId": "4f92a907-10c3-4798-e7b9-914b8f577b2c"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['state-of-the-art-tool',\n",
       " 'P',\n",
       " 'NN',\n",
       " 'NN',\n",
       " 'VV',\n",
       " 'JJ',\n",
       " 'NN',\n",
       " 'AD',\n",
       " 'VA',\n",
       " 'DEC',\n",
       " 'NN',\n",
       " 'NN',\n",
       " 'NN',\n",
       " 'PU']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pos.dict_tags = {'HanLP': 'state-of-the-art-tool'}\n",
    "pos([\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "F-9gAeIVQUFG"
   },
   "source": [
    "根据上下文自定义词性："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "F8M8cyBrQduw",
    "outputId": "24fa7ff0-305d-4d71-925e-f369b1c50e96"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['PN', '补语成分', '名词', 'VC', '动词', 'NR', 'DEG', 'NN', 'LB', 'NR', 'VV', 'PU']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pos.dict_tags = {('的', '希望'): ('补语成分', '名词'), '希望': '动词'}\n",
    "pos([\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "需要算法基础才能理解，初学者可参考[《自然语言处理入门》](http://nlp.hankcs.com/book.php)。"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "pos_stl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/sdp_mtl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sdp_mtl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fsdp_mtl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "IYwV-UkNNzFp",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n",
       " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n",
       " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.mtl.ALL # MTL多任务，具体任务见模型名称，语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1Uf_u7ddMhUt",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存。自然语言处理分为许多任务，分词只是最初级的一个。与其每个任务单独创建一个模型，不如利用HanLP的联合模型一次性完成多个任务："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "pp-1KqEOOJ4t",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 语义依存分析\n",
    "任务越少，速度越快。如指定仅执行语义依存分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [],
   "source": [
    "doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', tasks='sdp')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html)："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok/fine\": [\n",
      "    \"2021年\",\n",
      "    \"HanLPv2.1\",\n",
      "    \"为\",\n",
      "    \"生产\",\n",
      "    \"环境\",\n",
      "    \"带来\",\n",
      "    \"次\",\n",
      "    \"世代\",\n",
      "    \"最\",\n",
      "    \"先进\",\n",
      "    \"的\",\n",
      "    \"多\",\n",
      "    \"语种\",\n",
      "    \"NLP\",\n",
      "    \"技术\",\n",
      "    \"。\"\n",
      "  ],\n",
      "  \"sdp\": [\n",
      "    [[6, \"Time\"]],\n",
      "    [[6, \"Exp\"]],\n",
      "    [[5, \"mPrep\"]],\n",
      "    [[5, \"Desc\"]],\n",
      "    [[6, \"Datv\"]],\n",
      "    [[13, \"dDesc\"]],\n",
      "    [[0, \"Root\"], [8, \"Desc\"], [13, \"Desc\"]],\n",
      "    [[15, \"Time\"]],\n",
      "    [[10, \"mDegr\"]],\n",
      "    [[15, \"Desc\"]],\n",
      "    [[10, \"mAux\"]],\n",
      "    [[8, \"Quan\"], [13, \"Quan\"]],\n",
      "    [[15, \"Desc\"]],\n",
      "    [[15, \"Nmod\"]],\n",
      "    [[6, \"Pat\"]],\n",
      "    [[6, \"mPunc\"]]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`doc['sdp']`字段代表语义依存图的数组格式，数组中第`i`个子数组代表第`i`个单词的语义依存关系，子数组中每个二元组的格式为`[中心词的下标, 与中心词的语义依存关系]`。每个单词的语义依存关系可能有零个、一个或多个（任意数量）。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "转换为[CoNLLSentence](https://hanlp.hankcs.com/docs/api/common/conll.html#hanlp_common.conll.CoNLLSentence)格式更容易观察："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2021年\t_\t_\t_\t_\t_\t_\t6:Time\t_\n",
      "2\tHanLPv2.1\t_\t_\t_\t_\t_\t_\t6:Exp\t_\n",
      "3\t为\t_\t_\t_\t_\t_\t_\t5:mPrep\t_\n",
      "4\t生产\t_\t_\t_\t_\t_\t_\t5:Desc\t_\n",
      "5\t环境\t_\t_\t_\t_\t_\t_\t6:Datv\t_\n",
      "6\t带来\t_\t_\t_\t_\t_\t_\t13:dDesc\t_\n",
      "7\t次\t_\t_\t_\t_\t_\t_\t0:Root|8:Desc|13:Desc\t_\n",
      "8\t世代\t_\t_\t_\t_\t_\t_\t15:Time\t_\n",
      "9\t最\t_\t_\t_\t_\t_\t_\t10:mDegr\t_\n",
      "10\t先进\t_\t_\t_\t_\t_\t_\t15:Desc\t_\n",
      "11\t的\t_\t_\t_\t_\t_\t_\t10:mAux\t_\n",
      "12\t多\t_\t_\t_\t_\t_\t_\t8:Quan|13:Quan\t_\n",
      "13\t语种\t_\t_\t_\t_\t_\t_\t15:Desc\t_\n",
      "14\tNLP\t_\t_\t_\t_\t_\t_\t15:Nmod\t_\n",
      "15\t技术\t_\t_\t_\t_\t_\t_\t6:Pat\t_\n",
      "16\t。\t_\t_\t_\t_\t_\t_\t6:mPunc\t_\n"
     ]
    }
   ],
   "source": [
    "print(doc.to_conll())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XOsWkOqQfzlr"
   },
   "source": [
    "为已分词的句子执行语义依存分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "bLZSTbv_f3OA",
    "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\tHanLP\t_\t_\t_\t_\t_\t_\t5:Exp\t_\n",
      "2\t为\t_\t_\t_\t_\t_\t_\t4:mPrep\t_\n",
      "3\t生产\t_\t_\t_\t_\t_\t_\t4:Desc\t_\n",
      "4\t环境\t_\t_\t_\t_\t_\t_\t5:Datv\t_\n",
      "5\t带来\t_\t_\t_\t_\t_\t_\t0:Root\t_\n",
      "6\t次世代\t_\t_\t_\t_\t_\t_\t12:Time\t_\n",
      "7\t最\t_\t_\t_\t_\t_\t_\t8:mDegr\t_\n",
      "8\t先进\t_\t_\t_\t_\t_\t_\t12:Desc\t_\n",
      "9\t的\t_\t_\t_\t_\t_\t_\t8:mAux\t_\n",
      "10\t多语种\t_\t_\t_\t_\t_\t_\t12:Desc\t_\n",
      "11\tNLP\t_\t_\t_\t_\t_\t_\t12:Nmod\t_\n",
      "12\t技术\t_\t_\t_\t_\t_\t_\t5:Pat\t_\n",
      "13\t。\t_\t_\t_\t_\t_\t_\t5:mPunc\t_\n",
      "\n",
      "1\t我\t_\t_\t_\t_\t_\t_\t3:Poss\t_\n",
      "2\t的\t_\t_\t_\t_\t_\t_\t1:mAux\t_\n",
      "3\t希望\t_\t_\t_\t_\t_\t_\t4:Exp\t_\n",
      "4\t是\t_\t_\t_\t_\t_\t_\t11:mMod\t_\n",
      "5\t希望\t_\t_\t_\t_\t_\t_\t4:dClas\t_\n",
      "6\t张晚霞\t_\t_\t_\t_\t_\t_\t8:Poss\t_\n",
      "7\t的\t_\t_\t_\t_\t_\t_\t6:mAux\t_\n",
      "8\t背影\t_\t_\t_\t_\t_\t_\t11:Pat\t_\n",
      "9\t被\t_\t_\t_\t_\t_\t_\t10:mPrep\t_\n",
      "10\t晚霞\t_\t_\t_\t_\t_\t_\t11:Exp\t_\n",
      "11\t映红\t_\t_\t_\t_\t_\t_\t5:dCont\t_\n",
      "12\t。\t_\t_\t_\t_\t_\t_\t4:mPunc\t_\n"
     ]
    }
   ],
   "source": [
    "print(HanLP([\n",
    "    [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
    "    [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
    "  ], tasks='sdp', skip_tasks='tok*').to_conll())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 注意\n",
    "Native API的输入单位限定为句子，需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外，RESTful和native两种API的语义设计完全一致，用户可以无缝互换。"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "sdp_mtl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/sdp_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sdp_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fsdp_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "#### 申请密钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 语义依存分析\n",
    "任务越少，速度越快。如指定仅执行语义依存分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [],
   "source": [
    "doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', tasks='sdp')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html)："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok/fine\": [\n",
      "    [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"]\n",
      "  ],\n",
      "  \"sdp\": [\n",
      "    [[[6, \"Time\"]], [[6, \"Agt\"]], [[5, \"mPrep\"]], [[5, \"Desc\"]], [[6, \"Datv\"]], [[0, \"Root\"]], [[8, \"Qp\"]], [[15, \"TDur\"]], [[10, \"mDegr\"]], [[15, \"Desc\"]], [[10, \"mAux\"]], [[13, \"Quan\"]], [[15, \"Desc\"]], [[15, \"Nmod\"]], [[6, \"Cont\"]], [[6, \"mPunc\"]]]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`doc['sdp']`字段代表语义依存图的数组格式，数组中第`i`个子数组代表第`i`个单词的语义依存关系，子数组中每个二元组的格式为`[中心词的下标, 与中心词的语义依存关系]`。每个单词的语义依存关系可能有零个、一个或多个（任意数量）。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "转换为[CoNLLSentence](https://hanlp.hankcs.com/docs/api/common/conll.html#hanlp_common.conll.CoNLLSentence)格式更容易观察："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2021年\t_\t_\t_\t_\t_\t_\t6:Time\t_\n",
      "2\tHanLPv2.1\t_\t_\t_\t_\t_\t_\t6:Agt\t_\n",
      "3\t为\t_\t_\t_\t_\t_\t_\t5:mPrep\t_\n",
      "4\t生产\t_\t_\t_\t_\t_\t_\t5:Desc\t_\n",
      "5\t环境\t_\t_\t_\t_\t_\t_\t6:Datv\t_\n",
      "6\t带来\t_\t_\t_\t_\t_\t_\t0:Root\t_\n",
      "7\t次\t_\t_\t_\t_\t_\t_\t8:Qp\t_\n",
      "8\t世代\t_\t_\t_\t_\t_\t_\t15:TDur\t_\n",
      "9\t最\t_\t_\t_\t_\t_\t_\t10:mDegr\t_\n",
      "10\t先进\t_\t_\t_\t_\t_\t_\t15:Desc\t_\n",
      "11\t的\t_\t_\t_\t_\t_\t_\t10:mAux\t_\n",
      "12\t多\t_\t_\t_\t_\t_\t_\t13:Quan\t_\n",
      "13\t语种\t_\t_\t_\t_\t_\t_\t15:Desc\t_\n",
      "14\tNLP\t_\t_\t_\t_\t_\t_\t15:Nmod\t_\n",
      "15\t技术\t_\t_\t_\t_\t_\t_\t6:Cont\t_\n",
      "16\t。\t_\t_\t_\t_\t_\t_\t6:mPunc\t_\n"
     ]
    }
   ],
   "source": [
    "print(doc.to_conll())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XOsWkOqQfzlr"
   },
   "source": [
    "为已分词的句子执行语义依存分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "bLZSTbv_f3OA",
    "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\tHanLP\t_\t_\t_\t_\t_\t_\t5:Agt\t_\n",
      "2\t为\t_\t_\t_\t_\t_\t_\t4:mPrep\t_\n",
      "3\t生产\t_\t_\t_\t_\t_\t_\t4:Desc\t_\n",
      "4\t环境\t_\t_\t_\t_\t_\t_\t5:Datv\t_\n",
      "5\t带来\t_\t_\t_\t_\t_\t_\t0:Root\t_\n",
      "6\t次世代\t_\t_\t_\t_\t_\t_\t12:Time\t_\n",
      "7\t最\t_\t_\t_\t_\t_\t_\t8:mDegr\t_\n",
      "8\t先进\t_\t_\t_\t_\t_\t_\t12:Desc\t_\n",
      "9\t的\t_\t_\t_\t_\t_\t_\t8:mAux\t_\n",
      "10\t多语种\t_\t_\t_\t_\t_\t_\t12:Desc\t_\n",
      "11\tNLP\t_\t_\t_\t_\t_\t_\t12:Nmod\t_\n",
      "12\t技术\t_\t_\t_\t_\t_\t_\t5:Cont\t_\n",
      "13\t。\t_\t_\t_\t_\t_\t_\t5:mPunc\t_\n",
      "\n",
      "1\t我\t_\t_\t_\t_\t_\t_\t3:Poss\t_\n",
      "2\t的\t_\t_\t_\t_\t_\t_\t1:mAux\t_\n",
      "3\t希望\t_\t_\t_\t_\t_\t_\t0:Root|4:Exp\t_\n",
      "4\t是\t_\t_\t_\t_\t_\t_\t5:mMod\t_\n",
      "5\t希望\t_\t_\t_\t_\t_\t_\t4:dClas\t_\n",
      "6\t张晚霞\t_\t_\t_\t_\t_\t_\t8:Poss\t_\n",
      "7\t的\t_\t_\t_\t_\t_\t_\t6:mAux\t_\n",
      "8\t背影\t_\t_\t_\t_\t_\t_\t11:Pat\t_\n",
      "9\t被\t_\t_\t_\t_\t_\t_\t10:mPrep\t_\n",
      "10\t晚霞\t_\t_\t_\t_\t_\t_\t11:Exp\t_\n",
      "11\t映红\t_\t_\t_\t_\t_\t_\t5:dCont\t_\n",
      "12\t。\t_\t_\t_\t_\t_\t_\t5:mPunc\t_\n"
     ]
    }
   ],
   "source": [
    "print(HanLP(tokens=[\n",
    "    [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
    "    [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
    "  ], tasks='sdp').to_conll())"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "sdp_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/sdp_stl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sdp_stl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fsdp_stl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "nf9TgeCTC0OT"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jaW4eu6kC0OU",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_xI_bLAaC0OU"
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "IYwV-UkNNzFp",
    "outputId": "54065443-9b0a-444c-f6c0-c701bc86400b",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'SEMEVAL16_NEWS_BIAFFINE_ZH': 'https://file.hankcs.com/hanlp/sdp/semeval16-news-biaffine_20191231_235407.zip',\n",
       " 'SEMEVAL16_TEXT_BIAFFINE_ZH': 'https://file.hankcs.com/hanlp/sdp/semeval16-text-biaffine_20200101_002257.zip',\n",
       " 'SEMEVAL16_ALL_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/sdp/semeval16_sdp_electra_small_20220208_122026.zip',\n",
       " 'SEMEVAL15_PAS_BIAFFINE_EN': 'https://file.hankcs.com/hanlp/sdp/semeval15_biaffine_pas_20200103_152405.zip',\n",
       " 'SEMEVAL15_PSD_BIAFFINE_EN': 'https://file.hankcs.com/hanlp/sdp/semeval15_biaffine_psd_20200106_123009.zip',\n",
       " 'SEMEVAL15_DM_BIAFFINE_EN': 'https://file.hankcs.com/hanlp/sdp/semeval15_biaffine_dm_20200106_122808.zip'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.sdp.ALL # 语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1Uf_u7ddMhUt",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "pp-1KqEOOJ4t",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "sdp = hanlp.load('SEMEVAL16_ALL_ELECTRA_SMALL_ZH')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 语义依存分析\n",
    "语义依存分析的输入为已分词的一个或多个句子："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "id": "BqEmDMGGOtk3"
   },
   "outputs": [],
   "source": [
    "graph = sdp([\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "SwaPn1hjC0OW"
   },
   "source": [
    "返回对象为[CoNLLSentence](https://hanlp.hankcs.com/docs/api/common/conll.html#hanlp_common.conll.CoNLLSentence)类型："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "egpWwHKxC0OX",
    "outputId": "f7c77687-dd75-4fa2-dbd2-be6bda8a3fff"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'id': 1,\n",
       "  'form': '2021年',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(6, 'Time')],\n",
       "  'misc': None},\n",
       " {'id': 2,\n",
       "  'form': 'HanLPv2.1',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(6, 'Exp')],\n",
       "  'misc': None},\n",
       " {'id': 3,\n",
       "  'form': '为',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(5, 'mPrep')],\n",
       "  'misc': None},\n",
       " {'id': 4,\n",
       "  'form': '生产',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(5, 'Desc')],\n",
       "  'misc': None},\n",
       " {'id': 5,\n",
       "  'form': '环境',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(6, 'Datv')],\n",
       "  'misc': None},\n",
       " {'id': 6,\n",
       "  'form': '带来',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(2, 'eSucc')],\n",
       "  'misc': None},\n",
       " {'id': 7,\n",
       "  'form': '次',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(8, 'Desc'), (13, 'Desc')],\n",
       "  'misc': None},\n",
       " {'id': 8,\n",
       "  'form': '世代',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(0, 'Root'), (15, 'Time')],\n",
       "  'misc': None},\n",
       " {'id': 9,\n",
       "  'form': '最',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(10, 'mDegr')],\n",
       "  'misc': None},\n",
       " {'id': 10,\n",
       "  'form': '先进',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(15, 'Desc')],\n",
       "  'misc': None},\n",
       " {'id': 11,\n",
       "  'form': '的',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(10, 'mAux')],\n",
       "  'misc': None},\n",
       " {'id': 12,\n",
       "  'form': '多',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(10, 'mDegr'), (13, 'Quan')],\n",
       "  'misc': None},\n",
       " {'id': 13,\n",
       "  'form': '语种',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(15, 'Desc')],\n",
       "  'misc': None},\n",
       " {'id': 14,\n",
       "  'form': 'NLP',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(15, 'Desc')],\n",
       "  'misc': None},\n",
       " {'id': 15,\n",
       "  'form': '技术',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(6, 'Pat')],\n",
       "  'misc': None},\n",
       " {'id': 16,\n",
       "  'form': '。',\n",
       "  'upos': None,\n",
       "  'xpos': None,\n",
       "  'head': None,\n",
       "  'deprel': None,\n",
       "  'lemma': None,\n",
       "  'feats': None,\n",
       "  'deps': [(6, 'mPunc')],\n",
       "  'misc': None}]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "graph"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "kq_j5TLFC0OX"
   },
   "source": [
    "打印为CoNLL格式："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "isJhzYyIC0OX",
    "outputId": "683c8489-dffc-426e-f95b-e91dfb373260"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2021年\t_\t_\t_\t_\t_\t_\t6:Time\t_\n",
      "2\tHanLPv2.1\t_\t_\t_\t_\t_\t_\t6:Exp\t_\n",
      "3\t为\t_\t_\t_\t_\t_\t_\t5:mPrep\t_\n",
      "4\t生产\t_\t_\t_\t_\t_\t_\t5:Desc\t_\n",
      "5\t环境\t_\t_\t_\t_\t_\t_\t6:Datv\t_\n",
      "6\t带来\t_\t_\t_\t_\t_\t_\t2:eSucc\t_\n",
      "7\t次\t_\t_\t_\t_\t_\t_\t8:Desc|13:Desc\t_\n",
      "8\t世代\t_\t_\t_\t_\t_\t_\t0:Root|15:Time\t_\n",
      "9\t最\t_\t_\t_\t_\t_\t_\t10:mDegr\t_\n",
      "10\t先进\t_\t_\t_\t_\t_\t_\t15:Desc\t_\n",
      "11\t的\t_\t_\t_\t_\t_\t_\t10:mAux\t_\n",
      "12\t多\t_\t_\t_\t_\t_\t_\t10:mDegr|13:Quan\t_\n",
      "13\t语种\t_\t_\t_\t_\t_\t_\t15:Desc\t_\n",
      "14\tNLP\t_\t_\t_\t_\t_\t_\t15:Desc\t_\n",
      "15\t技术\t_\t_\t_\t_\t_\t_\t6:Pat\t_\n",
      "16\t。\t_\t_\t_\t_\t_\t_\t6:mPunc\t_\n"
     ]
    }
   ],
   "source": [
    "print(graph)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "sdp_stl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/sentiment_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sentiment_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fsentiment_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "nf9TgeCTC0OT"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jaW4eu6kC0OU",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_xI_bLAaC0OU"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "IYwV-UkNNzFp",
    "outputId": "54065443-9b0a-444c-f6c0-c701bc86400b",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1Uf_u7ddMhUt",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "#### 申请密钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 情感分析\n",
    "情感分析任务的输入为文档："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "BqEmDMGGOtk3"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8418035507202148"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.sentiment_analysis('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "SwaPn1hjC0OW"
   },
   "source": [
    "返回值为文档的情感极性，表示为$[-1, +1]$之间的数值，数值的正负代表正负面情绪，数值的绝对值代表情感的强烈程度。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "egpWwHKxC0OX",
    "outputId": "f7c77687-dd75-4fa2-dbd2-be6bda8a3fff"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8327275514602661"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.sentiment_analysis('看哭了。感人肺腑。')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "kq_j5TLFC0OX"
   },
   "source": [
    "注意返回值的符号代表正负情感："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "isJhzYyIC0OX",
    "outputId": "683c8489-dffc-426e-f95b-e91dfb373260"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-0.8850911855697632"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.sentiment_analysis('看哭了。难看哭了。')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "绝对值的大小代表情感的强烈程度："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-0.9190718531608582"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.sentiment_analysis('看哭了。难看哭了！！！')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "长文档一样支持："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9505730271339417"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = '''“这是一部男人必看的电影。”人人都这么说。但单纯从性别区分，就会让这电影变狭隘。\n",
    "《肖申克的救赎》突破了男人电影的局限，通篇几乎充满令人难以置信的温馨基调，而电影里最伟大的主题是“希望”。\n",
    "当我们无奈地遇到了如同肖申克一般囚禁了心灵自由的那种囹圄，我们是无奈的老布鲁克，灰心的瑞德，还是智慧的安迪？\n",
    "运用智慧，信任希望，并且勇敢面对恐惧心理，去打败它？\n",
    "经典的电影之所以经典，因为他们都在做同一件事——让你从不同的角度来欣赏希望的美好。'''\n",
    "HanLP.sentiment_analysis(text)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "sentiment_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/srl_mtl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/srl_mtl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fsrl_mtl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n",
       " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n",
       " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.mtl.ALL # MTL多任务，具体任务见模型名称，语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存。自然语言处理分为许多任务，分词只是最初级的一个。与其每个任务单独创建一个模型，不如利用HanLP的联合模型一次性完成多个任务："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 语义角色分析\n",
    "任务越少，速度越快。如指定仅执行语义角色分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [],
   "source": [
    "doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', tasks='srl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok/fine\": [\n",
      "    \"2021年\",\n",
      "    \"HanLPv2.1\",\n",
      "    \"为\",\n",
      "    \"生产\",\n",
      "    \"环境\",\n",
      "    \"带来\",\n",
      "    \"次\",\n",
      "    \"世代\",\n",
      "    \"最\",\n",
      "    \"先进\",\n",
      "    \"的\",\n",
      "    \"多\",\n",
      "    \"语种\",\n",
      "    \"NLP\",\n",
      "    \"技术\",\n",
      "    \"。\"\n",
      "  ],\n",
      "  \"srl\": [\n",
      "    [[\"2021年\", \"ARGM-TMP\", 0, 1], [\"HanLPv2.1\", \"ARG0\", 1, 2], [\"为生产环境\", \"ARG2\", 2, 5], [\"带来\", \"PRED\", 5, 6], [\"次世代最先进的多语种NLP技术\", \"ARG1\", 6, 15]],\n",
      "    [[\"最\", \"ARGM-ADV\", 8, 9], [\"先进\", \"PRED\", 9, 10], [\"技术\", \"ARG0\", 14, 15]]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`doc['srl']`字段为语义角色标注结果，每个四元组的格式为`[论元或谓词, 语义角色标签, 起始下标, 终止下标]`。其中，谓词的语义角色标签为`PRED`，起止下标对应以`tok`开头的第一个单词数组。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wxctCigrTKu-"
   },
   "source": [
    "可视化谓词论元结构："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Zo08uquCTFSk",
    "outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token    \tSRL PA1     \tToken    \tSRL PA2     \n",
      "─────────\t────────────\t─────────\t────────────\n",
      "2021年    \t───►ARGM-TMP\t2021年    \t            \n",
      "HanLPv2.1\t───►ARG0    \tHanLPv2.1\t            \n",
      "为        \t◄─┐         \t为        \t            \n",
      "生产       \t  ├►ARG2    \t生产       \t            \n",
      "环境       \t◄─┘         \t环境       \t            \n",
      "带来       \t╟──►PRED    \t带来       \t            \n",
      "次        \t◄─┐         \t次        \t            \n",
      "世代       \t  │         \t世代       \t            \n",
      "最        \t  │         \t最        \t───►ARGM-ADV\n",
      "先进       \t  │         \t先进       \t╟──►PRED    \n",
      "的        \t  ├►ARG1    \t的        \t            \n",
      "多        \t  │         \t多        \t            \n",
      "语种       \t  │         \t语种       \t            \n",
      "NLP      \t  │         \tNLP      \t            \n",
      "技术       \t◄─┘         \t技术       \t───►ARG0    \n",
      "。        \t            \t。        \t            \n"
     ]
    }
   ],
   "source": [
    "doc.pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "遍历谓词论元结构："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "第1个谓词论元结构：\n",
      "2021年 = ARGM-TMP at [0, 1]\n",
      "HanLPv2.1 = ARG0 at [1, 2]\n",
      "为生产环境 = ARG2 at [2, 5]\n",
      "带来 = PRED at [5, 6]\n",
      "次世代最先进的多语种NLP技术 = ARG1 at [6, 15]\n",
      "第2个谓词论元结构：\n",
      "最 = ARGM-ADV at [8, 9]\n",
      "先进 = PRED at [9, 10]\n",
      "技术 = ARG0 at [14, 15]\n"
     ]
    }
   ],
   "source": [
    "for i, pas in enumerate(doc['srl']):\n",
    "    print(f'第{i+1}个谓词论元结构：')\n",
    "    for form, role, begin, end in pas:\n",
    "        print(f'{form} = {role} at [{begin}, {end}]')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XOsWkOqQfzlr"
   },
   "source": [
    "为已分词的句子执行语义角色分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "bLZSTbv_f3OA",
    "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token\tSRL PA1 \tToken\tSRL PA2     \n",
      "─────\t────────\t─────\t────────────\n",
      "HanLP\t───►ARG0\tHanLP\t            \n",
      "为    \t◄─┐     \t为    \t            \n",
      "生产   \t  ├►ARG2\t生产   \t            \n",
      "环境   \t◄─┘     \t环境   \t            \n",
      "带来   \t╟──►PRED\t带来   \t            \n",
      "次世代  \t◄─┐     \t次世代  \t            \n",
      "最    \t  │     \t最    \t───►ARGM-ADV\n",
      "先进   \t  │     \t先进   \t╟──►PRED    \n",
      "的    \t  ├►ARG1\t的    \t            \n",
      "多语种  \t  │     \t多语种  \t            \n",
      "NLP  \t  │     \tNLP  \t            \n",
      "技术   \t◄─┘     \t技术   \t───►ARG0    \n",
      "。    \t        \t。    \t            \n",
      "\n",
      "Tok\tSRL PA1 \tTok\tSRL PA2 \tTok\tSRL PA3 \n",
      "───\t────────\t───\t────────\t───\t────────\n",
      "我  \t◄─┐     \t我  \t        \t我  \t        \n",
      "的  \t  ├►ARG0\t的  \t        \t的  \t        \n",
      "希望 \t◄─┘     \t希望 \t        \t希望 \t        \n",
      "是  \t╟──►PRED\t是  \t        \t是  \t        \n",
      "希望 \t◄─┐     \t希望 \t╟──►PRED\t希望 \t        \n",
      "张晚霞\t  │     \t张晚霞\t◄─┐     \t张晚霞\t        \n",
      "的  \t  │     \t的  \t  │     \t的  \t        \n",
      "背影 \t  ├►ARG1\t背影 \t  │     \t背影 \t        \n",
      "被  \t  │     \t被  \t  ├►ARG1\t被  \t        \n",
      "晚霞 \t  │     \t晚霞 \t  │     \t晚霞 \t───►ARG0\n",
      "映红 \t◄─┘     \t映红 \t◄─┘     \t映红 \t╟──►PRED\n",
      "。  \t        \t。  \t        \t。  \t        \n"
     ]
    }
   ],
   "source": [
    "HanLP([\n",
    "    [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
    "    [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
    "  ], tasks='srl', skip_tasks='tok*').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 注意\n",
    "Native API的输入单位限定为句子，需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外，RESTful和native两种API的语义设计完全一致，用户可以无缝互换。"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "srl_mtl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/srl_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/srl_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fsrl_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "#### 申请密钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 语义角色分析\n",
    "任务越少，速度越快。如指定仅执行语义角色分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [],
   "source": [
    "doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', tasks='srl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok/fine\": [\n",
      "    [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"]\n",
      "  ],\n",
      "  \"srl\": [\n",
      "    [[[\"2021年\", \"ARGM-TMP\", 0, 1], [\"HanLPv2.1\", \"ARG0\", 1, 2], [\"为生产环境\", \"ARG2\", 2, 5], [\"带来\", \"PRED\", 5, 6], [\"次世代最先进的多语种NLP技术\", \"ARG1\", 6, 15]], [[\"次世代\", \"ARGM-TMP\", 6, 8], [\"最\", \"ARGM-ADV\", 8, 9], [\"先进\", \"PRED\", 9, 10], [\"NLP技术\", \"ARG0\", 13, 15]]]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`doc['srl']`字段为语义角色标注结果，每个四元组的格式为`[论元或谓词, 语义角色标签, 起始下标, 终止下标]`。其中，谓词的语义角色标签为`PRED`，起止下标对应以`tok`开头的第一个单词数组。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wxctCigrTKu-"
   },
   "source": [
    "可视化谓词论元结构："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Zo08uquCTFSk",
    "outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token    \tSRL PA1     \tToken    \tSRL PA2     \n",
      "─────────\t────────────\t─────────\t────────────\n",
      "2021年    \t───►ARGM-TMP\t2021年    \t            \n",
      "HanLPv2.1\t───►ARG0    \tHanLPv2.1\t            \n",
      "为        \t◄─┐         \t为        \t            \n",
      "生产       \t  ├►ARG2    \t生产       \t            \n",
      "环境       \t◄─┘         \t环境       \t            \n",
      "带来       \t╟──►PRED    \t带来       \t            \n",
      "次        \t◄─┐         \t次        \t◄─┐         \n",
      "世代       \t  │         \t世代       \t◄─┴►ARGM-TMP\n",
      "最        \t  │         \t最        \t───►ARGM-ADV\n",
      "先进       \t  │         \t先进       \t╟──►PRED    \n",
      "的        \t  ├►ARG1    \t的        \t            \n",
      "多        \t  │         \t多        \t            \n",
      "语种       \t  │         \t语种       \t            \n",
      "NLP      \t  │         \tNLP      \t◄─┐         \n",
      "技术       \t◄─┘         \t技术       \t◄─┴►ARG0    \n",
      "。        \t            \t。        \t            \n"
     ]
    }
   ],
   "source": [
    "doc.pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "遍历谓词论元结构："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "第1个谓词论元结构：\n",
      "2021年 = ARGM-TMP at [0, 1]\n",
      "HanLPv2.1 = ARG0 at [1, 2]\n",
      "为生产环境 = ARG2 at [2, 5]\n",
      "带来 = PRED at [5, 6]\n",
      "次世代最先进的多语种NLP技术 = ARG1 at [6, 15]\n",
      "第2个谓词论元结构：\n",
      "次世代 = ARGM-TMP at [6, 8]\n",
      "最 = ARGM-ADV at [8, 9]\n",
      "先进 = PRED at [9, 10]\n",
      "NLP技术 = ARG0 at [13, 15]\n"
     ]
    }
   ],
   "source": [
    "for i, pas in enumerate(doc['srl'][0]):\n",
    "    print(f'第{i+1}个谓词论元结构：')\n",
    "    for form, role, begin, end in pas:\n",
    "        print(f'{form} = {role} at [{begin}, {end}]')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XOsWkOqQfzlr"
   },
   "source": [
    "为已分词的句子执行语义角色分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "bLZSTbv_f3OA",
    "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token\tSRL PA1 \tToken\tSRL PA2     \n",
      "─────\t────────\t─────\t────────────\n",
      "HanLP\t───►ARG0\tHanLP\t            \n",
      "为    \t◄─┐     \t为    \t            \n",
      "生产   \t  ├►ARG2\t生产   \t            \n",
      "环境   \t◄─┘     \t环境   \t            \n",
      "带来   \t╟──►PRED\t带来   \t            \n",
      "次世代  \t◄─┐     \t次世代  \t───►ARGM-TMP\n",
      "最    \t  │     \t最    \t───►ARGM-ADV\n",
      "先进   \t  │     \t先进   \t╟──►PRED    \n",
      "的    \t  ├►ARG1\t的    \t            \n",
      "多语种  \t  │     \t多语种  \t            \n",
      "NLP  \t  │     \tNLP  \t            \n",
      "技术   \t◄─┘     \t技术   \t───►ARG0    \n",
      "。    \t        \t。    \t            \n",
      "\n",
      "Tok\tSRL PA1 \tTok\tSRL PA2 \tTok\tSRL PA3 \n",
      "───\t────────\t───\t────────\t───\t────────\n",
      "我  \t◄─┐     \t我  \t        \t我  \t        \n",
      "的  \t  ├►ARG0\t的  \t        \t的  \t        \n",
      "希望 \t◄─┘     \t希望 \t        \t希望 \t        \n",
      "是  \t╟──►PRED\t是  \t        \t是  \t        \n",
      "希望 \t◄─┐     \t希望 \t╟──►PRED\t希望 \t        \n",
      "张晚霞\t  │     \t张晚霞\t◄─┐     \t张晚霞\t◄─┐     \n",
      "的  \t  │     \t的  \t  │     \t的  \t  ├►ARG1\n",
      "背影 \t  ├►ARG1\t背影 \t  │     \t背影 \t◄─┘     \n",
      "被  \t  │     \t被  \t  ├►ARG1\t被  \t        \n",
      "晚霞 \t  │     \t晚霞 \t  │     \t晚霞 \t───►ARG0\n",
      "映红 \t◄─┘     \t映红 \t◄─┘     \t映红 \t╟──►PRED\n",
      "。  \t        \t。  \t        \t。  \t        \n"
     ]
    }
   ],
   "source": [
    "HanLP(tokens=[\n",
    "    [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
    "    [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
    "  ], tasks='srl', skip_tasks='tok*').pretty_print()"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "srl_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/srl_stl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/srl_stl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fsrl_stl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CPB3_SRL_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/srl/cpb3_electra_small_crf_has_transform_20220218_135910.zip'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.srl.ALL # 语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "srl = hanlp.load('CPB3_SRL_ELECTRA_SMALL')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 语义角色分析\n",
    "为已分词的句子执行语义角色分析："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[('2021年', 'ARGM-TMP', 0, 1),\n",
       "  ('HanLPv2.1', 'ARG0', 1, 2),\n",
       "  ('为生产环境', 'ARG2', 2, 5),\n",
       "  ('带来', 'PRED', 5, 6),\n",
       "  ('次世代最先进的多语种NLP技术', 'ARG1', 6, 15)],\n",
       " [('次世代', 'ARGM-TMP', 6, 8),\n",
       "  ('最', 'ARGM-ADV', 8, 9),\n",
       "  ('先进', 'PRED', 9, 10),\n",
       "  ('技术', 'ARG0', 14, 15)]]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "srl(['2021年', 'HanLPv2.1', '为', '生产', '环境', '带来', '次', '世代', '最', '先进', '的', '多', '语种', 'NLP', '技术', '。'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "语义角色标注结果中每个四元组的格式为`[论元或谓词, 语义角色标签, 起始下标, 终止下标]`。其中，谓词的语义角色标签为`PRED`，起止下标对应单词数组。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "遍历谓词论元结构："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "第1个谓词论元结构：\n",
      "2021年 = ARGM-TMP at [0, 1]\n",
      "HanLPv2.1 = ARG0 at [1, 2]\n",
      "为生产环境 = ARG2 at [2, 5]\n",
      "带来 = PRED at [5, 6]\n",
      "次世代最先进的多语种NLP技术 = ARG1 at [6, 15]\n",
      "第2个谓词论元结构：\n",
      "次世代 = ARGM-TMP at [6, 8]\n",
      "最 = ARGM-ADV at [8, 9]\n",
      "先进 = PRED at [9, 10]\n",
      "技术 = ARG0 at [14, 15]\n"
     ]
    }
   ],
   "source": [
    "for i, pas in enumerate(srl(['2021年', 'HanLPv2.1', '为', '生产', '环境', '带来', '次', '世代', '最', '先进', '的', '多', '语种', 'NLP', '技术', '。'])):\n",
    "    print(f'第{i+1}个谓词论元结构：')\n",
    "    for form, role, begin, end in pas:\n",
    "        print(f'{form} = {role} at [{begin}, {end}]')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 注意\n",
    "Native API的输入单位限定为句子，需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外，RESTful和native两种API的语义设计完全一致，用户可以无缝互换。"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "srl_stl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/sts_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sts_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fsts_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "#### 申请秘钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 语义文本相似度\n",
    "输入两段短文本组成的二元组列表，执行语义文本相似度："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0.9764469861984253, 0.0, 0.003458738327026367]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.semantic_textual_similarity([\n",
    "    ('看图猜一电影名', '看图猜电影'),\n",
    "    ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'),\n",
    "    ('北京到上海的动车票', '上海到北京的动车票'),\n",
    "])"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "sts_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/sts_stl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sts_stl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Fsts_stl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.sts.ALL # 语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "sts = hanlp.load(hanlp.pretrained.sts.STS_ELECTRA_BASE_ZH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 语义文本相似度\n",
    "输入两段短文本组成的二元组列表，执行语义文本相似度："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0.9764469861984253, 0.0, 0.003458738327026367]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sts([\n",
    "    ('看图猜一电影名', '看图猜电影'),\n",
    "    ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'),\n",
    "    ('北京到上海的动车票', '上海到北京的动车票'),\n",
    "])"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "sts_stl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 20:36


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/demo_classifier.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 03:52
from hanlp.datasets.classification.sentiment import CHNSENTICORP_ERNIE_TEST

import hanlp

# Load the classifier registered under the identifier 'CHNSENTICORP_BERT_BASE_ZH'.
classifier = hanlp.load('CHNSENTICORP_BERT_BASE_ZH')
# Classify a single piece of text and print the prediction.
print(classifier.predict('前台客房服务态度非常好！早餐很丰富，房价很干净。再接再厉！'))

# predict a whole file in batch mode
# NOTE(review): gold=True presumably tells predict that the inputs carry gold labels - confirm.
outputs = classifier.predict(classifier.transform.file_to_inputs(CHNSENTICORP_ERNIE_TEST), gold=True)
# Only the first five outputs are shown to keep the demo output short.
print(outputs[:5])


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/demo_client.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-08 04:43
# pip3 install tensorflow-serving-api-gpu
import grpc
import tensorflow as tf
from tensorflow_core.python.framework import tensor_util
from tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc
import hanlp
from hanlp.common.keras_component import KerasComponent

# Only the transform of the tagger is needed on the client side, hence transform_only=True;
# the tagger object itself is discarded right after its transform has been taken.
tagger: KerasComponent = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN, transform_only=True)
transform = tagger.transform
del tagger

# Two pre-tokenized sentences to be sent to the server.
inputs = [['商品', '和', '服务'],
          ['我', '的', '希望', '是', '希望', '和平']]

# Take the first element of the first item yielded by the dataset built from the inputs.
# NOTE(review): presumably the first batch's features (labels dropped) - confirm.
samples = next(iter(transform.inputs_to_dataset(inputs)))[0]
print(samples)

# Build a prediction request for the serving endpoint at localhost:8500 over an insecure channel.
channel = grpc.insecure_channel('{host}:{port}'.format(host='localhost', port=8500))
stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
request = predict_pb2.PredictRequest()
request.model_spec.name = 'ctb5_pos_rnn_20191229_015325'
request.model_spec.signature_name = 'serving_default'
# The samples are sent as a float32 tensor under the input name 'embedding_input'.
request.inputs['embedding_input'].CopyFrom(
    tf.make_tensor_proto(samples, dtype=tf.float32))
result = stub.Predict(request, 10.0)  # 10 secs timeout
print(result)
# Convert the 'dense' output tensor of the response back into an ndarray.
prediction = tensor_util.MakeNdarray(result.outputs['dense'])
print(prediction)

# Let the transform turn the raw prediction back into outputs aligned with the inputs.
print(list(transform.Y_to_outputs(prediction, inputs=inputs)))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/demo_cws.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:25
import hanlp

# Load the tokenizer registered as LARGE_ALBERT_BASE.
tokenizer = hanlp.load(hanlp.pretrained.tok.LARGE_ALBERT_BASE)
# The tokenizer is called with a single string ...
print(tokenizer('商品和服务'))
# ... and with a list of strings.
print(tokenizer(['萨哈夫说，伊拉克将同联合国销毁伊拉克大规模杀伤性武器特别委员会继续保持合作。',
                 '上海华安工业（集团）公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。',
                 'HanLP支援臺灣正體、香港繁體，具有新詞辨識能力的中文斷詞系統']))

# Output of the plain tokenizer, for comparison with the dictionary-aware split_by_dic below.
text = 'NLP统计模型没有加规则，聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
print(tokenizer(text))

# Custom dictionary read by split_by_dic: word -> tag attached to the word in the output.
dic = {'自定义词典': 'custom_dict', '聪明人': 'smart'}


def split_by_dic(text: str):
    """Tokenize ``text`` while keeping every word of the custom dictionary in one piece.

    The text is cut at each occurrence of a key of the module-level ``dic``. The
    chunks between the hits are tokenized in one batch by the module-level
    ``tokenizer``; each hit is emitted as a ``(word, tag)`` tuple, ``tag`` being
    the value stored in ``dic``.

    Args:
        text: The raw text to tokenize.

    Returns:
        A flat list, in text order, mixing the tokens produced by ``tokenizer``
        with ``(word, tag)`` tuples for the dictionary hits.
    """
    # We use regular expression for the sake of simplicity.
    # However, you should use some trie trees for production
    import re
    # Keys are escaped so that they match literally, and longer keys are tried first
    # so that a key which is a prefix of another key cannot shadow it. Empty keys are
    # ignored because they would match at every position.
    keys = sorted((word for word in dic if word), key=len, reverse=True)
    matches = re.finditer('|'.join(re.escape(word) for word in keys), text) if keys else ()
    # ``segments`` records the output order: ``None`` stands for a chunk that still has
    # to be tokenized, a ``(word, tag)`` tuple is a dictionary hit emitted as is.
    sents, segments, offset = [], [], 0
    for m in matches:
        if offset < m.start():
            sents.append(text[offset: m.start()])
            segments.append(None)
        # The hit is recorded even when it starts right at ``offset`` (beginning of the
        # text or directly after the previous hit), so it is never re-tokenized.
        segments.append((m.group(), dic[m.group()]))
        offset = m.end()
    if offset < len(text):
        sents.append(text[offset:])
        segments.append(None)
    # Tokenize all chunks in a single batch; skip the call when nothing is left to tokenize.
    preds = iter(tokenizer(sents) if sents else ())
    flat = []
    for segment in segments:
        if segment is None:
            flat.extend(next(preds))
        else:
            flat.append(segment)
    return flat


# Tokenize the same text again, this time with the custom dictionary applied.
print(split_by_dic(text))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/demo_cws_trie.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:25
from hanlp_trie.trie import Trie

import hanlp

# Load the tokenizer by its identifier and show its output without any dictionary.
tokenizer = hanlp.load('LARGE_ALBERT_BASE')
text = 'NLP统计模型没有加规则，聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
print(tokenizer(text))

# Custom dictionary stored in a trie: word -> value emitted in place of the word by merge_parts.
trie = Trie()
trie.update({'自定义词典': 'custom_dict', '聪明人': 'smart'})


def split_sents(text: str, trie: Trie):
    """Cut ``text`` into the chunks lying between the dictionary words found by ``trie``.

    Args:
        text: The raw text.
        trie: The dictionary; its ``parse_longest`` result is iterated as
            ``(start, end, value)`` triples.

    Returns:
        A tuple ``(sents, offsets, words)``: the non-empty chunks of ``text`` not
        covered by a dictionary word, the start offset of each chunk, and the
        result of ``trie.parse_longest(text)`` as is.
    """
    words = trie.parse_longest(text)
    sents, offsets = [], []
    cursor = 0  # position right after the previous dictionary word
    for begin, finish, _ in words:
        # Text between the previous word and this one forms a chunk of its own.
        if cursor != begin:
            offsets.append(cursor)
            sents.append(text[cursor:begin])
        cursor = finish
    # Whatever follows the last dictionary word is the final chunk.
    if cursor != len(text):
        offsets.append(cursor)
        sents.append(text[cursor:])
    return sents, offsets, words


# Show the chunks, their offsets and the dictionary hits for the demo text.
print(split_sents(text, trie))


def merge_parts(parts, offsets, words):
    """Interleave tokenized chunks and dictionary values in text order.

    Args:
        parts: One iterable of tokens per chunk.
        offsets: The start offset of each chunk, aligned with ``parts``.
        words: ``(start, end, value)`` triples of the dictionary hits.

    Returns:
        A flat list with the tokens of every chunk and the ``value`` of every
        dictionary hit, ordered by start offset.
    """
    anchored = list(zip(offsets, parts))
    # Each dictionary hit contributes a single-element "chunk" holding its value.
    for begin, _, label in words:
        anchored.append((begin, [label]))
    anchored.sort()
    merged = []
    for _, chunk in anchored:
        merged.extend(chunk)
    return merged


# Chain the three steps into a pipeline: split the text around the dictionary words,
# tokenize the remaining chunks ('parts'), then merge tokens and dictionary values back
# together. Note that the name ``tokenizer`` is rebound to the pipeline; the right-hand
# side is evaluated first, so the pipeline wraps the originally loaded tokenizer.
tokenizer = hanlp.pipeline() \
    .append(split_sents, output_key=('parts', 'offsets', 'words'), trie=trie) \
    .append(tokenizer, input_key='parts', output_key='tokens') \
    .append(merge_parts, input_key=('tokens', 'offsets', 'words'), output_key='merged')

# Run the whole pipeline on the demo text.
print(tokenizer(text))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/demo_dep.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:25
import hanlp

# Load the dependency parser registered as CTB7_BIAFFINE_DEP_ZH.
syntactic_parser = hanlp.load(hanlp.pretrained.dep.CTB7_BIAFFINE_DEP_ZH)
# The input sentence is a list of (token, tag) pairs.
# NOTE(review): the tags look like part-of-speech labels - confirm the expected tag set.
sent = [('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')]
tree = syntactic_parser(sent)
print(tree)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/demo_fasttext.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-12 18:33
import hanlp
import torch

# fasttext is a `torch.nn.Module`. Unless you know how to code in
# PyTorch, don't bother to use this.
fasttext = hanlp.load(hanlp.pretrained.fasttext.FASTTEXT_WIKI_300_ZH)

# Calling the module with a word returns its vector.
vec = fasttext('单词')
print(vec)

# Cosine similarity between the vectors of two words, computed along dim 0.
print(torch.nn.functional.cosine_similarity(fasttext('单词'), fasttext('词语'), dim=0))
print(torch.nn.functional.cosine_similarity(fasttext('单词'), fasttext('今天'), dim=0))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/demo_multiprocess.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-02-15 11:30
import multiprocessing
import hanlp

# Loaded at module level so that ``worker`` can reach it as a global.
tokenizer = hanlp.load(hanlp.pretrained.tok.LARGE_ALBERT_BASE)


def worker(job):
    """Print ``job`` and then the result of the module-level ``tokenizer`` applied to it."""
    print(job)
    tokens = tokenizer(job)
    print(tokens)


if __name__ == '__main__':
    # Number of worker processes, which is also the number of jobs dispatched below.
    num_proc = 2
    # Important! The python multiprocessing package defaults to just calling fork when creating a child process.
    # This cannot work when the child process calls async code (e.g. TensorFlow is multithreaded).
    # See https://github.com/tensorflow/tensorflow/issues/8220#issuecomment-302826884
    # See https://sefiks.com/2019/03/20/tips-and-tricks-for-gpu-and-multiprocessing-in-tensorflow/
    multiprocessing.set_start_method('spawn', force=True)  # only spawn works with TensorFlow
    # One job string per process; the pool is closed when the ``with`` block exits.
    with multiprocessing.Pool(num_proc) as pool:
        pool.map(worker, [f'给{i}号进程的任务' for i in range(num_proc)])


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/demo_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-30 19:52
import hanlp

# Load the named entity recognizer registered as MSRA_NER_BERT_BASE_ZH.
recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)
# Each sentence is passed as a list of single characters (``list(str)``); two sentences form one batch.
print(recognizer.predict([list('上海华安工业（集团）公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。'),
                          list('萨哈夫说，伊拉克将同联合国销毁伊拉克大规模杀伤性武器特别委员会继续保持合作。')]))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/demo_pipeline.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 03:24

import hanlp

# Load the four components by their identifiers.
tokenizer = hanlp.load('LARGE_ALBERT_BASE')
tagger = hanlp.load('CTB9_POS_ALBERT_BASE')
syntactic_parser = hanlp.load('CTB7_BIAFFINE_DEP_ZH')
semantic_parser = hanlp.load('SEMEVAL16_TEXT_BIAFFINE_ZH')

# Chain the components: sentence splitting -> tokenization -> tagging -> the two parsers.
# Both parsers read the 'tokens' and 'part_of_speech_tags' outputs of the earlier steps.
pipeline = hanlp.pipeline() \
    .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
    .append(tokenizer, output_key='tokens') \
    .append(tagger, output_key='part_of_speech_tags') \
    .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies', conll=False) \
    .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies', conll=False)
print(pipeline)

# A multi-line document to run through the pipeline.
text = '''HanLP是一系列模型与算法组成的自然语言处理工具包，目标是普及自然语言处理在生产环境中的应用。
HanLP具备功能完善、性能高效、架构清晰、语料时新、可自定义的特点。
内部算法经过工业界和学术界考验，配套书籍《自然语言处理入门》已经出版。
'''

doc = pipeline(text)
print(doc)
# The doc is JSON serializable by default, provided that every pipe outputs JSON serializable objects too.
# print(json.dumps(doc, ensure_ascii=False, indent=2))

# You can save the config to disk for deploying or sharing.
pipeline.save('zh.json')
# Then load it smoothly.
deployed = hanlp.load('zh.json')
print(deployed)
print(deployed(text))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/demo_pos.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:25
import hanlp
from hanlp.pretrained.pos import CTB9_POS_ALBERT_BASE

# Load the tagger through the identifier imported from hanlp.pretrained.pos.
tagger = hanlp.load(CTB9_POS_ALBERT_BASE)
# predict is called with a single tokenized sentence ...
print(tagger.predict(['我', '的', '希望', '是', '希望', '世界', '和平']))
# ... and with a batch of tokenized sentences.
print(tagger.predict([['支持', '批处理', '地', '预测'], ['速度', '更', '快']]))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/demo_sdp.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 23:55
import hanlp

# Load the semantic dependency parser by its identifier.
semantic_parser = hanlp.load('SEMEVAL16_NEWS_BIAFFINE_ZH')
# The input sentence is a list of (token, tag) pairs.
# NOTE(review): the tags look like part-of-speech labels - confirm the expected tag set.
sent = [('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')]
print(semantic_parser(sent))


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/demo_serving.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-06 20:23
import hanlp
from hanlp.common.keras_component import KerasComponent

# Load the tagger registered as CTB5_POS_RNN.
tagger: KerasComponent = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN)
# Sanity check on a whitespace-separated sentence before serving.
print(tagger('商品 和 服务'.split()))
# NOTE(review): presumably starts serving the model and blocks - confirm against KerasComponent.serve.
tagger.serve()


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-26 23:25


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 20:55

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_ctb6_cws_albert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22

from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST
from tests import cdroot

# NOTE(review): presumably changes the working directory to the project root so that the
# relative paths below resolve - confirm against tests.cdroot.
cdroot()
tokenizer = TransformerTokenizerTF()
save_dir = 'data/model/cws_bert_albert_ctb6'
# Fit on the CTB6 training set with the CTB6 dev set, saving into save_dir.
# NOTE(review): the transformer path is an absolute path on one particular machine; it has to be
# adapted before this script can run elsewhere.
tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_DEV, save_dir,
              transformer='/home/ubuntu/hankcs/laser/data/transformer/albert_base_tf2',
              metrics='f1', learning_rate=5e-5, epochs=3)
# Load the model back from save_dir, try it on two sentences, then evaluate on the test set.
tokenizer.load(save_dir)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_ctb6_cws_bert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22

from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST
from tests import cdroot

# NOTE(review): presumably changes the working directory to the project root so that the
# relative paths below resolve - confirm against tests.cdroot.
cdroot()
tokenizer = TransformerTokenizerTF()
save_dir = 'data/model/cws_bert_base_ctb6'
# Fit on the CTB6 training set with the CTB6 dev set, saving into save_dir.
tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_DEV, save_dir, transformer='chinese_L-12_H-768_A-12',
              metrics='f1')
# Load the model back from save_dir, try it on two sentences, then evaluate on the test set.
tokenizer.load(save_dir)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_ctb6_cws_convseg.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22
import tensorflow as tf

from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF
from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
from tests import cdroot

# NOTE(review): presumably changes the working directory to the project root so that the
# relative paths below resolve - confirm against tests.cdroot.
cdroot()
tokenizer = NgramConvTokenizerTF()
save_dir = 'data/model/cws/ctb6_cws'
# Adam with gradient norm clipping (clipnorm=5).
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                     epsilon=1e-8, clipnorm=5)
# Fit on the CTB6 training set with the CTB6 dev set; the word embedding is described by a
# config dict pointing at the pretrained CONVSEG_W2V_NEWS_TENSITE_CHAR vectors.
tokenizer.fit(CTB6_CWS_TRAIN,
              CTB6_CWS_DEV,
              save_dir,
              word_embed={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': True,
                              'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
                              'expand_vocab': False,
                              'lowercase': False,
                          }},
              optimizer=optimizer,
              window_size=0,
              weight_norm=True)
# Evaluate on the test set, then try the tokenizer on two sentences.
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_large_bert_cws.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_DEV, CTB6_CWS_TEST
from tests import cdroot

# NOTE(review): presumably changes the working directory to the project root so that the
# relative paths below resolve - confirm against tests.cdroot.
cdroot()
tokenizer = TransformerTokenizerTF()
save_dir = 'data/model/cws_bert_base_100million'
# Fit on the local corpus file 'data/cws/large/all.txt' with the CTB6 dev set.
# Training tracks accuracy, whereas the model is reloaded below with metrics='f1'.
tokenizer.fit('data/cws/large/all.txt', CTB6_CWS_DEV, save_dir, transformer='bert-base-chinese',
              metrics='accuracy', batch_size=32)
tokenizer.load(save_dir, metrics='f1')
# Try the reloaded model on two sentences, then evaluate on the CTB6 test set.
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_large_conv_cws.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 21:58

import tensorflow as tf

from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF
from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
from tests import cdroot

# Fit an n-gram convolutional tokenizer on the CTB6 train/dev constants using
# the CONVSEG character vectors, then evaluate on the CTB6 test constant.
# NOTE(review): the file name says "large" but the training data here is CTB6.
cdroot()
tokenizer = NgramConvTokenizerTF()
save_dir = 'data/model/cws/ctb6_cws'
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8, clipnorm=5)

# Embedding layer spec handed to fit(); the vectors are marked trainable.
char_embedding = {
    'class_name': 'HanLP>Word2VecEmbedding',
    'config': {
        'trainable': True,
        'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
        'expand_vocab': False,
        'lowercase': False,
    },
}

tokenizer.fit(
    CTB6_CWS_TRAIN,
    CTB6_CWS_DEV,
    save_dir,
    word_embed=char_embedding,
    optimizer=optimizer,
    window_size=0,
    weight_norm=True,
)
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
demo_sentences = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(demo_sentences))
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_large_cws_albert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22
from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_DEV, CTB6_CWS_TEST
from tests import cdroot

# Fit an ALBERT-based tokenizer on the large corpus file for 3 epochs, reload
# it with the F1 metric, then predict and evaluate on the CTB6 test constant.
cdroot()
tokenizer = TransformerTokenizerTF()
save_dir = 'data/model/large_corpus_cws_albert_base'
train_corpus = 'data/cws/large/all.txt'

tokenizer.fit(
    train_corpus,
    CTB6_CWS_DEV,
    save_dir,
    transformer='uer/albert-base-chinese-cluecorpussmall',
    max_seq_length=128,
    metrics='accuracy',
    learning_rate=5e-5,
    epochs=3,
)
# Training tracked accuracy; the reloaded model is configured with F1.
tokenizer.load(save_dir, metrics='f1')
demo_sentences = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(demo_sentences))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_large_cws_electra.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22
from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_DEV, CTB6_CWS_TEST
from tests import cdroot

# Fit an ELECTRA-small tokenizer on the large corpus file for 10 epochs, reload
# it with the F1 metric, then predict and evaluate on the CTB6 test constant.
cdroot()
tokenizer = TransformerTokenizerTF()
save_dir = 'data/model/large_corpus_cws_electra_small'
train_corpus = 'data/cws/large/all.txt'

tokenizer.fit(
    train_corpus,
    CTB6_CWS_DEV,
    save_dir,
    transformer='hfl/chinese-electra-small-discriminator',
    max_seq_length=128,
    metrics='accuracy',
    learning_rate=5e-5,
    epochs=10,
)
# Training tracked accuracy; the reloaded model is configured with F1.
tokenizer.load(save_dir, metrics='f1')
demo_sentences = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(demo_sentences))
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_large_rnn_cws.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
import tensorflow as tf

from hanlp.components.tokenizers.tok_tf import RNNTokenizerTF
from hanlp.datasets.cws.ctb import CTB6_CWS_TEST, CTB6_CWS_DEV
from hanlp.pretrained.word2vec import RADICAL_CHAR_EMBEDDING_100
from tests import cdroot

# Fit an RNN tokenizer on the large corpus file with the radical character
# vectors, reload it with the F1 metric, then evaluate on the CTB6 test
# constant and print a sample prediction.
cdroot()

tokenizer = RNNTokenizerTF()
save_dir = 'data/model/cws/large_rnn_cws'
# Adam with gradient-norm clipping, matching the settings the conv tokenizer scripts use.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                     epsilon=1e-8, clipnorm=5)
tokenizer.fit('data/cws/large/all.txt',
              CTB6_CWS_DEV,
              save_dir,
              embeddings={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': True,
                              'filepath': RADICAL_CHAR_EMBEDDING_100,
                              'expand_vocab': False,
                              'lowercase': False,
                          }},
              # Hand the optimizer configured above to fit(); it was previously
              # constructed but never passed, so its clipnorm/epsilon had no effect.
              optimizer=optimizer,
              early_stopping_patience=5,
              batch_size=64,
              max_seq_len=64,
              metrics='accuracy'
              )
# Training tracked accuracy; the reloaded model is configured with F1.
tokenizer.load(save_dir, metrics='f1')
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_msr_cws_albert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22

# Imports follow the sibling TF scripts in this directory (tok_tf / *TF class,
# SIGHAN2005_MSR_DEV / SIGHAN2005_MSR_TEST from the same msr module).
from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
from hanlp.datasets.tokenization.sighan2005.msr import SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_DEV, SIGHAN2005_MSR_TEST
from tests import cdroot

# Fit an ALBERT-based tokenizer on the SIGHAN 2005 MSR train/dev constants,
# reload it, print a sample prediction and evaluate on the MSR test constant.
cdroot()
tokenizer = TransformerTokenizerTF()
save_dir = 'data/model/msr_cws_albert_base'
# NOTE(review): other scripts here pass hub-style ids such as
# 'uer/albert-base-chinese-cluecorpussmall' — confirm 'albert_base_zh' still resolves.
tokenizer.fit(SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_DEV, save_dir,
              transformer='albert_base_zh',
              max_seq_length=150,
              metrics='f1', learning_rate=5e-5, epochs=10)
tokenizer.load(save_dir)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
# Evaluate on the test split of the corpus the model was trained on (MSR),
# not on the CTB6 test set, which follows a different segmentation standard.
tokenizer.evaluate(SIGHAN2005_MSR_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_msr_cws_bert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
from hanlp.datasets.tokenization.sighan2005.msr import SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_DEV, SIGHAN2005_MSR_TEST
from tests import cdroot

# Fit a BERT-based tokenizer on the SIGHAN 2005 MSR train/dev constants, print
# a sample prediction and evaluate on the MSR test constant.
cdroot()
tokenizer = TransformerTokenizerTF()
save_dir = 'data/model/cws_bert_base_msra'
tokenizer.fit(
    SIGHAN2005_MSR_TRAIN,
    SIGHAN2005_MSR_DEV,
    save_dir,
    transformer='bert-base-chinese',
    metrics='f1',
)
# No reload from save_dir here: the freshly fitted instance is used directly.
demo_sentences = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(demo_sentences))
tokenizer.evaluate(SIGHAN2005_MSR_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_msr_cws_ngram_conv.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
import tensorflow as tf

from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF
from hanlp.datasets.tokenization.sighan2005.msr import SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_DEV, SIGHAN2005_MSR_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
from tests import cdroot

# Fit an n-gram convolutional tokenizer on the SIGHAN 2005 MSR train/dev
# constants with the CONVSEG character vectors, then predict and evaluate on
# the MSR test constant.
cdroot()
tokenizer = NgramConvTokenizerTF()
save_dir = 'data/model/cws/convseg-msr-nocrf-noembed'

# Embedding layer spec handed to fit(); the vectors are marked trainable.
char_embedding = {
    'class_name': 'HanLP>Word2VecEmbedding',
    'config': {
        'trainable': True,
        'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
        'expand_vocab': False,
        'lowercase': False,
    },
}
adam = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8, clipnorm=5)

tokenizer.fit(
    SIGHAN2005_MSR_TRAIN,
    SIGHAN2005_MSR_DEV,
    save_dir,
    word_embed=char_embedding,
    optimizer=adam,
    epochs=100,
    window_size=0,
    metrics='f1',
    weight_norm=True,
)
demo_sentences = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(demo_sentences))
tokenizer.evaluate(SIGHAN2005_MSR_TEST, save_dir=save_dir)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_msr_cws_ngram_conv_embed.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
import tensorflow as tf

# Imports follow the sibling TF scripts in this directory (tok_tf / *TF class,
# SIGHAN2005_MSR_DEV from the same msr module).
from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF
from hanlp.datasets.tokenization.sighan2005.msr import SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_DEV, SIGHAN2005_MSR_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR, CONVSEG_W2V_NEWS_TENSITE_WORD_MSR
from tests import cdroot

# Fit an n-gram convolutional tokenizer on the SIGHAN 2005 MSR train/dev
# constants with both character and n-gram embeddings, print a sample
# prediction, reload with the F1 metric and evaluate on the MSR test constant.
cdroot()
tokenizer = NgramConvTokenizerTF()
# NOTE(review): this directory name says "noembed" and is the same path the
# plain n-gram conv MSR script writes to — confirm the overlap is intended.
save_dir = 'data/model/cws/convseg-msr-nocrf-noembed'
tokenizer.fit(SIGHAN2005_MSR_TRAIN,
              SIGHAN2005_MSR_DEV,
              save_dir,
              # Character-level embedding spec.
              word_embed={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': True,
                              'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
                              'expand_vocab': False,
                              'lowercase': False,
                          }},
              # N-gram embedding spec; unlike the character one, expand_vocab is on.
              ngram_embed={'class_name': 'HanLP>Word2VecEmbedding',
                           'config': {
                               'trainable': True,
                               'filepath': CONVSEG_W2V_NEWS_TENSITE_WORD_MSR,
                               'expand_vocab': True,
                               'lowercase': False,
                           }},
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001,
                                                 epsilon=1e-8, clipnorm=5),
              epochs=3,
              window_size=4,
              metrics='f1',
              weight_norm=True)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.load(save_dir, metrics='f1')
tokenizer.evaluate(SIGHAN2005_MSR_TEST, save_dir=save_dir)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_pku980106_conv_cws.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
import tensorflow as tf

from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF
from hanlp.pretrained.word2vec import RADICAL_CHAR_EMBEDDING_100
from tests import cdroot

# Fit an n-gram convolutional tokenizer on the PKU98 (1998-01..06) segmented
# file with frozen radical character vectors, then evaluate and predict.
cdroot()

tokenizer = NgramConvTokenizerTF()
save_dir = 'data/model/cws/pku98_6m_conv_ngram'
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8, clipnorm=5)

train_path = 'data/cws/pku98/199801-06-seg.txt'
# NOTE(review): the same file is passed as fit()'s second argument and to evaluate().
test_path = 'data/cws/pku98/test_pku98_name_merged.txt'

# Embedding layer spec; 'trainable' is off and 'expand_vocab' is on.
char_embedding = {
    'class_name': 'HanLP>Word2VecEmbedding',
    'config': {
        'trainable': False,
        'filepath': RADICAL_CHAR_EMBEDDING_100,
        'expand_vocab': True,
        'lowercase': False,
    },
}

tokenizer.fit(
    train_path,
    test_path,
    save_dir,
    word_embed=char_embedding,
    optimizer=optimizer,
    window_size=0,
    weight_norm=True,
)
tokenizer.evaluate(test_path, save_dir=save_dir, output=False)
demo_sentences = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(demo_sentences))
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_pku980106_rnn_cws.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
import tensorflow as tf

from hanlp.components.tokenizers.tok_tf import RNNTokenizerTF
from hanlp.pretrained.word2vec import RADICAL_CHAR_EMBEDDING_100
from tests import cdroot

# Fit an RNN tokenizer on the PKU98 (1998-01..06) segmented file with frozen
# radical character vectors, then evaluate and print a sample prediction.
# NOTE(review): the same pku98_test.txt file is passed as fit()'s second
# argument and to evaluate().
cdroot()

tokenizer = RNNTokenizerTF()
save_dir = 'data/model/cws/pku_6m_rnn_cws'
# Adam with gradient-norm clipping, matching the settings the conv tokenizer scripts use.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                     epsilon=1e-8, clipnorm=5)
tokenizer.fit('data/cws/pku98/199801-06-seg.txt',
              'data/cws/pku98/pku98_test.txt',
              save_dir,
              embeddings={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': False,
                              'filepath': RADICAL_CHAR_EMBEDDING_100,
                              'expand_vocab': True,
                              'lowercase': False,
                          }},
              # Hand the optimizer configured above to fit(); it was previously
              # constructed but never passed, so its clipnorm/epsilon had no effect.
              optimizer=optimizer,
              )
tokenizer.evaluate('data/cws/pku98/pku98_test.txt', save_dir=save_dir, output=False)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_pku_conv_cws.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005_PKU_TRAIN, SIGHAN2005_PKU_DEV, SIGHAN2005_PKU_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
from hanlp.utils.tf_util import nice
from tests import cdroot
import tensorflow as tf

# Fit an n-gram convolutional tokenizer on the SIGHAN 2005 PKU train/dev
# constants with the CONVSEG character vectors, then evaluate on the PKU test
# constant and print a sample prediction.
nice()
cdroot()
# Kept after nice()/cdroot() as in the original script — presumably deliberate; verify before hoisting.
from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF

tokenizer = NgramConvTokenizerTF()
save_dir = 'data/model/cws/sighan2005-pku-convseg'
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8, clipnorm=5)

# Embedding layer spec handed to fit(); the vectors are marked trainable.
char_embedding = {
    'class_name': 'HanLP>Word2VecEmbedding',
    'config': {
        'trainable': True,
        'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
        'expand_vocab': False,
        'lowercase': False,
    },
}

tokenizer.fit(
    SIGHAN2005_PKU_TRAIN,
    SIGHAN2005_PKU_DEV,
    save_dir,
    word_embed=char_embedding,
    optimizer=optimizer,
    window_size=0,
    weight_norm=True,
)
tokenizer.evaluate(SIGHAN2005_PKU_TEST, save_dir=save_dir, output=False)
demo_sentences = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(demo_sentences))
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/finetune_msra_ner_albert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 23:15
import hanlp
from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF
from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST
from tests import cdroot

# Fine-tune an ALBERT NER model on the character-level MSRA train/dev
# constants starting from a pretrained HanLP model identifier, then reload,
# predict on a sample sentence and evaluate on the MSRA test constant.
cdroot()
recognizer = TransformerNamedEntityRecognizerTF()
save_dir = 'data/model/ner/finetune_ner_albert_base_zh_msra'
# Identifier passed as fit()'s `finetune` argument.
pretrained_model = hanlp.pretrained.ner.MSRA_NER_ALBERT_BASE_ZH

recognizer.fit(
    MSRA_NER_CHAR_LEVEL_TRAIN,
    MSRA_NER_CHAR_LEVEL_DEV,
    save_dir,
    transformer='albert_base_zh',
    finetune=pretrained_model,
)
recognizer.load(save_dir)
# The recognizer is fed a list of single characters.
sample_chars = list('上海华安工业（集团）公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。')
print(recognizer.predict(sample_chars))
recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_chnsenticorp_bert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-30 21:01
from hanlp.components.classifiers.transformer_classifier_tf import TransformerClassifierTF, TransformerTextTransform
from hanlp.datasets.classification.sentiment import CHNSENTICORP_ERNIE_TRAIN, CHNSENTICORP_ERNIE_TEST, \
    CHNSENTICORP_ERNIE_DEV
from tests import cdroot

# Fit a BERT-based text classifier on the ChnSentiCorp train/dev constants,
# reload it, classify one sample review and evaluate on the test constant.
cdroot()
save_dir = 'data/model/classification/chnsenticorp_bert_base'
# y_column=0 is forwarded to the transform (presumably the label column index — confirm).
text_transform = TransformerTextTransform(y_column=0)
classifier = TransformerClassifierTF(text_transform)
classifier.fit(
    CHNSENTICORP_ERNIE_TRAIN,
    CHNSENTICORP_ERNIE_DEV,
    save_dir,
    transformer='bert-base-chinese',
)
classifier.load(save_dir)
sample_review = '前台客房服务态度非常好！早餐很丰富，房价很干净。再接再厉！'
print(classifier.predict(sample_review))
classifier.evaluate(CHNSENTICORP_ERNIE_TEST, save_dir=save_dir)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_conll03_ner_bert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-25 21:34
from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF
from hanlp.datasets.ner.conll03 import CONLL03_EN_TRAIN, CONLL03_EN_DEV, CONLL03_EN_TEST
from tests import cdroot

# Fit a BERT-based NER tagger on the CoNLL03 English train/dev constants,
# reload it with the F1 metric, tag a sample sentence and evaluate on test.
cdroot()
tagger = TransformerNamedEntityRecognizerTF()
save_dir = 'data/model/ner/ner_conll03_bert_base_cased_en'
tagger.fit(
    CONLL03_EN_TRAIN,
    CONLL03_EN_DEV,
    save_dir,
    transformer='bert-base-cased',
    metrics='accuracy',
)
# Training tracked accuracy; the reloaded model is configured with F1.
tagger.load(save_dir, metrics='f1')
sample_tokens = 'West Indian all-rounder Phil Simmons eats apple .'.split()
print(tagger.predict(sample_tokens))
tagger.evaluate(CONLL03_EN_TEST, save_dir=save_dir, output=False, batch_size=32)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_conll03_ner_flair.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-25 21:34

import tensorflow as tf

from hanlp.components.ner.ner_tf import RNNNamedEntityRecognizerTF
from hanlp.datasets.ner.conll03 import CONLL03_EN_TRAIN, CONLL03_EN_DEV, CONLL03_EN_TEST
from hanlp.pretrained.glove import GLOVE_6B_100D
from hanlp.pretrained.rnnlm import FLAIR_LM_FW_WMT11_EN_TF, FLAIR_LM_BW_WMT11_EN_TF
from tests import cdroot

# Fit an RNN NER tagger with a CRF loss on CoNLL03 English, stacking frozen
# GloVe vectors with forward/backward contextual string embeddings, then tag a
# sample sentence and evaluate on the test constant.
cdroot()
tagger = RNNNamedEntityRecognizerTF()
save_dir = 'data/model/conll03-ner-rnn-flair'
# Validate on the dev split so the test split is only touched by the final
# evaluate() call; passing the test split here would leak it into training.
tagger.fit(CONLL03_EN_TRAIN, CONLL03_EN_DEV, save_dir, epochs=100,
           # NOTE(review): 0.1 is far above Keras Adam's default of 0.001 — confirm intended.
           optimizer=tf.keras.optimizers.Adam(learning_rate=0.1,
                                              beta_1=0.9,
                                              beta_2=0.999,
                                              epsilon=1e-8),
           loss='crf',
           rnn_units=256,
           embeddings=[
               # Static word vectors, not updated during training.
               {'class_name': 'HanLP>Word2VecEmbedding',
                'config': {
                    'trainable': False,
                    'embeddings_initializer': 'zero',
                    'filepath': GLOVE_6B_100D,
                    'expand_vocab': True,
                    'lowercase': False
                }},
               # Contextual embeddings from the forward and backward language models, also frozen.
               {'class_name': 'HanLP>ContextualStringEmbedding',
                'config': {
                    'trainable': False,
                    'forward_model_path': FLAIR_LM_FW_WMT11_EN_TF,
                    'backward_model_path': FLAIR_LM_BW_WMT11_EN_TF
                }}
           ],
           rnn_output_dropout=0.5,
           rnn_input_dropout=0.5,
           batch_size=32,
           metrics='f1',
           anneal_factor=0.5,
           patience=2,
           )
print(tagger.predict('West Indian all-rounder Phil Simmons eats apple .'.split()))
# tagger.load(save_dir)
tagger.evaluate(CONLL03_EN_TEST, save_dir=save_dir, output=False)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb5_dep.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 18:33
from hanlp.components.parsers.biaffine_parser_tf import BiaffineDependencyParserTF
from hanlp.datasets.parsing.ctb5 import CTB5_DEP_TRAIN, CTB5_DEP_DEV, CTB5_DEP_TEST
from hanlp.pretrained.word2vec import CTB5_FASTTEXT_300_CN
from tests import cdroot

# Fit a biaffine dependency parser on the CTB5 train/dev constants with frozen
# fastText vectors, reload it, parse one POS-tagged sentence and evaluate on
# the CTB5 test constant.
cdroot()
save_dir = 'data/model/dep/biaffine_ctb'
parser = BiaffineDependencyParserTF()

# Embedding layer spec; vectors are frozen, lowercased and normalized per this config.
word_embedding = {
    'class_name': 'HanLP>Word2VecEmbedding',
    'config': {
        'trainable': False,
        'embeddings_initializer': 'zero',
        'filepath': CTB5_FASTTEXT_300_CN,
        'expand_vocab': True,
        'lowercase': True,
        'normalize': True,
    },
}
parser.fit(CTB5_DEP_TRAIN, CTB5_DEP_DEV, save_dir, pretrained_embed=word_embedding)
parser.load(save_dir)

# Input is a list of (word, POS tag) pairs.
sentence = [
    ('中国', 'NR'), ('批准', 'VV'), ('设立', 'VV'),
    ('外商', 'NN'), ('投资', 'NN'), ('企业', 'NN'),
    ('逾', 'VV'), ('三十万', 'CD'), ('家', 'M'),
]
print(parser.predict(sentence))
parser.evaluate(CTB5_DEP_TEST, save_dir)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb5_pos_rnn.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:46
from hanlp.components.taggers.pos_tf import RNNPartOfSpeechTaggerTF
from hanlp.datasets.pos.ctb5 import CTB5_POS_TRAIN, CTB5_POS_DEV, CTB5_POS_TEST
from hanlp.pretrained.fasttext import FASTTEXT_WIKI_300_ZH
from tests import cdroot

# Fit an RNN part-of-speech tagger on the CTB5 train/dev constants with
# fastText embeddings, then evaluate on the CTB5 test constant.
cdroot()
tagger = RNNPartOfSpeechTaggerTF()
save_dir = 'data/model/pos/ctb5_pos_rnn_fasttext'
# Embedding layer spec pointing at the fastText wiki model.
fasttext_embedding = {
    'class_name': 'HanLP>FastTextEmbedding',
    'config': {'filepath': FASTTEXT_WIKI_300_ZH},
}
tagger.fit(CTB5_POS_TRAIN, CTB5_POS_DEV, save_dir, embeddings=fasttext_embedding)
tagger.evaluate(CTB5_POS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb7_dep.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 18:33
from hanlp.components.parsers.biaffine_parser_tf import BiaffineDependencyParserTF
from hanlp.datasets.parsing.ctb5 import CIP_W2V_100_CN
from hanlp.datasets.parsing.ctb7 import CTB7_DEP_TRAIN, CTB7_DEP_DEV, CTB7_DEP_TEST
from tests import cdroot

# Fit a biaffine dependency parser on the CTB7 train/dev constants with frozen
# CIP word vectors, reload it, parse one POS-tagged sentence and evaluate on
# the CTB7 test constant.
cdroot()
save_dir = 'data/model/dep/biaffine_ctb7'
parser = BiaffineDependencyParserTF()

# Embedding layer spec; vectors are frozen, lowercased and normalized per this config.
word_embedding = {
    'class_name': 'HanLP>Word2VecEmbedding',
    'config': {
        'trainable': False,
        'embeddings_initializer': 'zero',
        'filepath': CIP_W2V_100_CN,
        'expand_vocab': True,
        'lowercase': True,
        'normalize': True,
    },
}
parser.fit(CTB7_DEP_TRAIN, CTB7_DEP_DEV, save_dir, pretrained_embed=word_embedding)
parser.load(save_dir)

# Input is a list of (word, POS tag) pairs.
sentence = [
    ('中国', 'NR'), ('批准', 'VV'), ('设立', 'VV'),
    ('外商', 'NN'), ('投资', 'NN'), ('企业', 'NN'),
    ('逾', 'VV'), ('三十万', 'CD'), ('家', 'M'),
]
print(parser.predict(sentence))
parser.evaluate(CTB7_DEP_TEST, save_dir)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb9_pos_albert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 23:15
from hanlp.components.taggers.transformers.transformer_tagger_tf import TransformerTaggerTF
from tests import cdroot

# Fit an ALBERT-based POS tagger on the CTB9 TSV files for 20 epochs, reload
# it, tag a sample sentence and evaluate.
cdroot()
tagger = TransformerTaggerTF()
save_dir = 'data/model/pos/ctb9_albert_base'
train_path = 'data/pos/ctb9/train.tsv'
# NOTE(review): the same file is passed as fit()'s second argument and to evaluate().
test_path = 'data/pos/ctb9/test.tsv'

tagger.fit(
    train_path,
    test_path,
    save_dir,
    transformer='uer/albert-base-chinese-cluecorpussmall',
    max_seq_length=130,
    warmup_steps_ratio=0.1,
    epochs=20,
    learning_rate=5e-5,
)
tagger.load(save_dir)
sample_tokens = ['我', '的', '希望', '是', '希望', '和平']
print(tagger(sample_tokens))
tagger.evaluate(test_path, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb9_pos_electra.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 23:15
from hanlp.components.taggers.transformers.transformer_tagger_tf import TransformerTaggerTF
from tests import cdroot

# Fit an ELECTRA-small POS tagger on the CTB9 TSV files for 20 epochs, reload
# it, tag a sample sentence and evaluate.
cdroot()
tagger = TransformerTaggerTF()
save_dir = 'data/model/pos/ctb9_electra_small_zh_epoch_20'
train_path = 'data/pos/ctb9/train.tsv'
# NOTE(review): the same file is passed as fit()'s second argument and to evaluate().
test_path = 'data/pos/ctb9/test.tsv'

tagger.fit(
    train_path,
    test_path,
    save_dir,
    transformer='hfl/chinese-electra-small-discriminator',
    max_seq_length=130,
    warmup_steps_ratio=0.1,
    epochs=20,
    learning_rate=5e-5,
)
tagger.load(save_dir)
sample_tokens = ['我', '的', '希望', '是', '希望', '和平']
print(tagger(sample_tokens))
tagger.evaluate(test_path, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_albert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 23:15
from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF
from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST
from tests import cdroot

# Fit an ALBERT-based NER model on the character-level MSRA train/dev
# constants, reload it with the F1 metric, predict on a sample sentence and
# evaluate on the MSRA test constant.
cdroot()
recognizer = TransformerNamedEntityRecognizerTF()
save_dir = 'data/model/ner/msra_ner_albert_base'
recognizer.fit(
    MSRA_NER_CHAR_LEVEL_TRAIN,
    MSRA_NER_CHAR_LEVEL_DEV,
    save_dir,
    transformer='uer/albert-base-chinese-cluecorpussmall',
    learning_rate=5e-5,
    # Use accuracy to speed up training
    metrics='accuracy',
)
recognizer.load(save_dir, metrics='f1')
# The recognizer is fed a list of single characters.
sample_chars = list('上海华安工业（集团）公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。')
print(recognizer.predict(sample_chars))
recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_bert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 23:15
from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF
from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST
from tests import cdroot

# Fit a BERT-based NER model on the character-level MSRA train/dev constants,
# reload it with the F1 metric, predict on a sample sentence and evaluate on
# the MSRA test constant.
cdroot()
recognizer = TransformerNamedEntityRecognizerTF()
save_dir = 'data/model/ner/ner_bert_base_msra_1'
recognizer.fit(
    MSRA_NER_CHAR_LEVEL_TRAIN,
    MSRA_NER_CHAR_LEVEL_DEV,
    save_dir,
    transformer='bert-base-chinese',
    # accuracy is faster
    metrics='accuracy',
)
recognizer.load(save_dir, metrics='f1')
# The recognizer is fed a list of single characters.
sample_chars = list('上海华安工业（集团）公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。')
print(recognizer.predict(sample_chars))
recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_electra.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 23:15
from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF
from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST
from tests import cdroot

# Fit an ELECTRA-small NER model on the character-level MSRA train/dev
# constants, reload it with the F1 metric, predict on a sample sentence and
# evaluate on the MSRA test constant.
cdroot()
recognizer = TransformerNamedEntityRecognizerTF()
save_dir = 'data/model/ner/ner_electra_small_zh_msra_sparse_categorical_crossentropy'
recognizer.fit(
    MSRA_NER_CHAR_LEVEL_TRAIN,
    MSRA_NER_CHAR_LEVEL_DEV,
    save_dir,
    transformer='hfl/chinese-electra-small-discriminator',
    learning_rate=5e-5,
    # Use accuracy to speed up training
    metrics='accuracy',
)
recognizer.load(save_dir, metrics='f1')
# The recognizer is fed a list of single characters.
sample_chars = list('上海华安工业（集团）公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。')
print(recognizer.predict(sample_chars))
recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_ngram_conv.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 23:15
from hanlp.components.ner.ner_tf import NgramConvNamedEntityRecognizerTF
from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR, \
    CONVSEG_W2V_NEWS_TENSITE_WORD_MSR
from tests import cdroot

# Fit an n-gram convolutional NER model on the character-level MSRA train/dev
# constants with character and n-gram embeddings, then evaluate on the MSRA
# test constant.
cdroot()
recognizer = NgramConvNamedEntityRecognizerTF()
save_dir = 'data/model/ner/msra_ner_ngram_conv'


def _word2vec_spec(filepath, expand_vocab):
    # Build a trainable, case-preserving Word2VecEmbedding layer spec.
    return {
        'class_name': 'HanLP>Word2VecEmbedding',
        'config': {
            'trainable': True,
            'filepath': filepath,
            'expand_vocab': expand_vocab,
            'lowercase': False,
        },
    }


recognizer.fit(
    MSRA_NER_CHAR_LEVEL_TRAIN,
    MSRA_NER_CHAR_LEVEL_DEV,
    save_dir,
    word_embed=_word2vec_spec(CONVSEG_W2V_NEWS_TENSITE_CHAR, False),
    ngram_embed=_word2vec_spec(CONVSEG_W2V_NEWS_TENSITE_WORD_MSR, True),
    weight_norm=True,
)
recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_rnn.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 23:15
from hanlp.components.ner.ner_tf import RNNNamedEntityRecognizerTF
from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST
from hanlp.pretrained.word2vec import RADICAL_CHAR_EMBEDDING_100
from tests import cdroot

# Fit an RNN NER model on the character-level MSRA train/dev constants with
# trainable radical character embeddings for 100 epochs, then evaluate on the
# MSRA test constant.
cdroot()
recognizer = RNNNamedEntityRecognizerTF()
save_dir = 'data/model/ner/msra_ner_rnn'
recognizer.fit(
    MSRA_NER_CHAR_LEVEL_TRAIN,
    MSRA_NER_CHAR_LEVEL_DEV,
    save_dir,
    # Here the embedding identifier is passed directly rather than as a layer spec dict.
    embeddings=RADICAL_CHAR_EMBEDDING_100,
    embedding_trainable=True,
    epochs=100,
)
recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_albert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-07 23:48
from hanlp.metrics.parsing import conllx_eval

from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING
from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF
from tests import cdroot

# Fit a biaffine transformer dependency parser with 'albert-xxlarge-v2' on the
# PTB SD 3.3.0 train/dev constants, reload it and evaluate on the test constant.
cdroot()
save_dir = 'data/model/dep/ptb_albert3'
parser = BiaffineTransformerDependencyParserTF()
parser.fit(
    PTB_SD330_TRAIN,
    PTB_SD330_DEV,
    save_dir,
    'albert-xxlarge-v2',
    batch_size=256,
    warmup_steps_ratio=.1,
    token_mapping=PTB_TOKEN_MAPPING,
    samples_per_batch=150,
    transformer_dropout=.33,
    # Separate learning rates are given for the parser and for the transformer.
    learning_rate=2e-3,
    learning_rate_transformer=1e-5,
    # early_stopping_patience=10,
)
parser.load(save_dir)
parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False)
# Scoring with conllx_eval is left disabled in this script:
# output = f'{save_dir}/test.predict.conll'
# uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output)
# print(f'Official UAS: {uas:.4f} LAS: {las:.4f}')
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_bert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-07 23:48
from hanlp.metrics.parsing import conllx_eval

from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING
from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF
from tests import cdroot

# Load a previously saved biaffine BERT dependency parser with tree='tarjan'
# and evaluate it on the PTB SD 3.3.0 test constant. The training call is kept
# below for reference but is disabled, so save_dir must already hold a model.
cdroot()
save_dir = 'data/model/dep/ptb_bert_1e-5'
parser = BiaffineTransformerDependencyParserTF()
# Disabled training configuration:
# parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased',
#            batch_size=3000,
#            warmup_steps_ratio=.1,
#            token_mapping=PTB_TOKEN_MAPPING,
#            samples_per_batch=150,
#            transformer_dropout=.33,
#            learning_rate=2e-3,
#            learning_rate_transformer=1e-5,
#            # early_stopping_patience=10,
#            )
parser.load(save_dir, tree='tarjan')
parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False)
# Scoring with conllx_eval is left disabled in this script:
# output = f'{save_dir}/test.predict.conll'
# uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output)
# print(f'Official UAS: {uas:.4f} LAS: {las:.4f}')
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_bert_96.6.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-07 23:48

from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING
from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF
from tests import cdroot
from hanlp.metrics.parsing import conllx_eval

# NOTE(review): presumably changes into the repository root so that the
# relative `data/...` path below resolves — confirm against tests.cdroot.
cdroot()
# Checkpoint directory that is loaded and evaluated below.
save_dir = 'data/model/dep/ptb_bert_96.61'
parser = BiaffineTransformerDependencyParserTF()
# Training is disabled in this script; only the saved model is evaluated.
# parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased',
#            batch_size=3000,
#            warmup_steps_ratio=.1,
#            token_mapping=PTB_TOKEN_MAPPING,
#            samples_per_batch=150,
#            )
parser.load(save_dir)
# Path passed to evaluate() as `output` and then handed to conllx_eval.
output = f'{save_dir}/test.predict.conll'
parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False, output=output)
# Score the file at `output` against the test set and report UAS / LAS.
uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output)
print(f'Official UAS: {uas:.4f} LAS: {las:.4f}')
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_bert_positional.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-07 23:48
from hanlp.metrics.parsing import conllx_eval

from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING
from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF
from tests import cdroot

# NOTE(review): presumably changes into the repository root so that the
# relative model path below resolves — confirm against tests.cdroot.
cdroot()
save_dir = 'data/model/dep/ptb_bert_positional_diff_lr'

# Keyword arguments forwarded unchanged to fit(). Note the two distinct
# learning-rate arguments and the d_positional setting.
fit_options = dict(
    batch_size=3000,
    warmup_steps_ratio=0.1,
    token_mapping=PTB_TOKEN_MAPPING,
    samples_per_batch=150,
    transformer_dropout=0.33,
    learning_rate=1e-4,
    learning_rate_transformer=1e-5,
    d_positional=128,
    # early_stopping_patience=10,
)

parser = BiaffineTransformerDependencyParserTF()
parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased', **fit_options)

# The freshly trained parser is evaluated directly; reloading stays disabled.
# parser.load(save_dir)
parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False)

# Disabled: scoring a prediction file with conllx_eval.
# output = f'{save_dir}/test.predict.conll'
# uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output)
# print(f'Official UAS: {uas:.4f} LAS: {las:.4f}')
# print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_sa_albert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-07 23:48
from hanlp.metrics.parsing import conllx_eval

from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING
from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF, \
    StructuralAttentionDependencyParserTF
from hanlp.pretrained.glove import GLOVE_840B_300D
from tests import cdroot

# NOTE(review): presumably changes into the repository root so that the
# relative `data/...` path below resolves — confirm against tests.cdroot.
cdroot()
# Checkpoint directory that is loaded and evaluated below.
# NOTE(review): the directory name says "glove" while this file is named
# "sa_albert", and train_ptb_dep_sa_bert.py uses the very same directory —
# confirm this sharing is intended.
save_dir = 'data/model/dep/ptb_sa_glove'
parser = StructuralAttentionDependencyParserTF()
# Training is disabled in this script; the call below is kept commented out.
# parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased',
#            batch_size=3000,
#            warmup_steps_ratio=.1,
#            token_mapping=PTB_TOKEN_MAPPING,
#            samples_per_batch=150,
#            transformer_dropout=.33,
#            masked_lm_dropout=.33,
#            # learning_rate=2e-3,
#            # learning_rate_transformer=1e-5,
#            masked_lm_embed={'class_name': 'HanLP>Word2VecEmbedding',
#                             'config': {
#                                 'trainable': False,
#                                 # 'embeddings_initializer': 'zero',
#                                 'filepath': GLOVE_840B_300D,
#                                 'expand_vocab': False,
#                                 'lowercase': True,
#                                 'cpu': False
#                             }}
#            # alpha=1,
#            # early_stopping_patience=10,
#            # num_decoder_layers=2,
#            )
# Restore the saved model from save_dir.
parser.load(save_dir)
# output = f'{save_dir}/test.predict.conll'
# Evaluate the loaded model on the PTB test split.
parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False)
# Disabled: scoring a prediction file with conllx_eval (needs `output` above).
# uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output)
# print(f'Official UAS: {uas:.4f} LAS: {las:.4f}')
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_sa_albert_topk.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-07 23:48
from hanlp.metrics.parsing import conllx_eval

from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING
from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF, \
    StructuralAttentionDependencyParserTF
from hanlp.pretrained.glove import GLOVE_840B_300D
from tests import cdroot

# NOTE(review): presumably changes into the repository root so that the
# relative model path below resolves — confirm against tests.cdroot.
cdroot()
save_dir = 'data/model/dep/ptb_sa_topk'

# Keyword arguments forwarded unchanged to fit().
fit_options = dict(
    batch_size=3000,
    warmup_steps_ratio=0.1,
    token_mapping=PTB_TOKEN_MAPPING,
    samples_per_batch=150,
    transformer_dropout=0.33,
    masked_lm_dropout=0.33,
    learning_rate=2e-3,
    learning_rate_transformer=1e-5,
    # alpha=1,
    # early_stopping_patience=10,
    # num_decoder_layers=2,
)

parser = StructuralAttentionDependencyParserTF()
parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased', **fit_options)

# Reload what fit() wrote to save_dir, then evaluate on the PTB test split.
parser.load(save_dir)
parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False)

# Disabled: scoring a prediction file with conllx_eval.
# output = f'{save_dir}/test.predict.conll'
# uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output)
# print(f'Official UAS: {uas:.4f} LAS: {las:.4f}')
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_sa_bert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-07 23:48
from hanlp.metrics.parsing import conllx_eval

from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING
from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF, \
    StructuralAttentionDependencyParserTF
from hanlp.pretrained.glove import GLOVE_840B_300D
from tests import cdroot

# NOTE(review): presumably changes into the repository root so that the
# relative `data/...` path below resolves — confirm against tests.cdroot.
cdroot()
# Checkpoint directory that is loaded and evaluated below.
# NOTE(review): the directory name says "glove" while this file is named
# "sa_bert", and train_ptb_dep_sa_albert.py uses the very same directory —
# confirm this sharing is intended.
save_dir = 'data/model/dep/ptb_sa_glove'
parser = StructuralAttentionDependencyParserTF()
# Training is disabled in this script; the call below is kept commented out.
# parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased',
#            batch_size=3000,
#            warmup_steps_ratio=.1,
#            token_mapping=PTB_TOKEN_MAPPING,
#            samples_per_batch=150,
#            transformer_dropout=.33,
#            masked_lm_dropout=.33,
#            # learning_rate=2e-3,
#            # learning_rate_transformer=1e-5,
#            masked_lm_embed={'class_name': 'HanLP>Word2VecEmbedding',
#                             'config': {
#                                 'trainable': False,
#                                 # 'embeddings_initializer': 'zero',
#                                 'filepath': GLOVE_840B_300D,
#                                 'expand_vocab': False,
#                                 'lowercase': True,
#                                 'cpu': False
#                             }}
#            # alpha=1,
#            # early_stopping_patience=10,
#            # num_decoder_layers=2,
#            )
# Restore the saved model from save_dir.
parser.load(save_dir)
# output = f'{save_dir}/test.predict.conll'
# Evaluate the loaded model on the PTB test split.
parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False)
# Disabled: scoring a prediction file with conllx_eval (needs `output` above).
# uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output)
# print(f'Official UAS: {uas:.4f} LAS: {las:.4f}')
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_sa_pos_bert.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-07 23:48
from hanlp.metrics.parsing import conllx_eval

from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING
from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF, \
    StructuralAttentionDependencyParserTF
from hanlp.pretrained.glove import GLOVE_840B_300D
from tests import cdroot

# NOTE(review): presumably changes into the repository root so that the
# relative `data/...` paths below resolve — confirm against tests.cdroot.
cdroot()
save_dir = 'data/model/dep/ptb_sa_bert_joint_pos'

# Keyword arguments forwarded unchanged to fit(); joint_pos=True is the
# setting that distinguishes this script from its siblings.
fit_options = dict(
    batch_size=256,
    warmup_steps_ratio=0.1,
    token_mapping=PTB_TOKEN_MAPPING,
    samples_per_batch=150,
    transformer_dropout=0.33,
    masked_lm_dropout=0.33,
    learning_rate=2e-3,
    learning_rate_transformer=1e-5,
    joint_pos=True,
    # alpha=1,
    # early_stopping_patience=10,
    # num_decoder_layers=2,
)

parser = StructuralAttentionDependencyParserTF()
parser.fit('data/ptb-dep/train.conllx', 'data/ptb-dep/dev.conllx', save_dir, 'bert-base-uncased',
           **fit_options)

# The freshly trained parser is evaluated directly; reloading stays disabled.
# parser.load(save_dir)
parser.evaluate('data/ptb-dep/test.conllx', save_dir, warm_up=False)

# Disabled: scoring a prediction file with conllx_eval.
# output = f'{save_dir}/test.predict.conll'
# uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output)
# print(f'Official UAS: {uas:.4f} LAS: {las:.4f}')
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_pos_rnn_fasttext.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-25 21:34

import tensorflow as tf

from hanlp.components.taggers.pos_tf import RNNPartOfSpeechTaggerTF
from hanlp.pretrained.fasttext import FASTTEXT_CC_300_EN
from tests import cdroot

# Train, reload and evaluate an RNN part-of-speech tagger on PTB using
# fastText embeddings, then print predictions for a few demo sentences.
#
# NOTE(review): cdroot() presumably changes into the repository root so that
# the relative `data/...` paths below resolve — confirm against tests.cdroot.
cdroot()
tagger = RNNPartOfSpeechTaggerTF()
save_dir = 'data/model/pos/ptb_pos_rnn_fasttext'
# `learning_rate` is the documented keyword; `lr` is only a deprecated alias
# that newer Keras releases no longer accept.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.015)
# optimizer = 'adam'
tagger.fit('data/ptb-pos/train.tsv',
           'data/ptb-pos/dev.tsv',
           batch_size=10,
           save_dir=save_dir,
           embeddings={'class_name': 'HanLP>FastTextEmbedding',
                       'config': {'filepath': FASTTEXT_CC_300_EN}},
           optimizer=optimizer,
           lr_decay_per_epoch=0.05,
           rnn_units=100,
           rnn_input_dropout=0.5,
           rnn_output_dropout=0.5,
           epochs=100,
           verbose=True)
# Reload what fit() wrote to save_dir, then score on the test split.
tagger.load(save_dir)
tagger.evaluate('data/ptb-pos/test.tsv', save_dir=save_dir, output=False)
# Single pre-tokenized sentence. The comma after 'This' matters: without it
# Python concatenates the adjacent literals into the single token 'Thistime'.
print(tagger.predict(['This', 'time', 'is', 'for', 'dinner']))
# A batch of two pre-tokenized sentences.
print(tagger.predict([['This', 'is', 'an', 'old', 'story'],
                      ['Not', 'this', 'year', '.']]))
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval15_dm.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 18:26
from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF
from hanlp.pretrained.glove import GLOVE_6B_100D
from tests import cdroot

# NOTE(review): presumably changes into the repository root so that the
# relative `data/...` paths below resolve — confirm against tests.cdroot.
cdroot()
save_dir = 'data/model/sdp/semeval15_biaffine_dm'

# Non-trainable GloVe word vectors handed to fit() as `pretrained_embed`.
glove_embed = {
    'class_name': 'HanLP>Word2VecEmbedding',
    'config': {
        'trainable': False,
        'embeddings_initializer': 'zero',
        'filepath': GLOVE_6B_100D,
        'expand_vocab': True,
        'lowercase': True,
        'normalize': True,
    },
}

parser = BiaffineSemanticDependencyParserTF()
parser.fit('data/semeval15/en.dm.train.conll', 'data/semeval15/en.dm.dev.conll', save_dir,
           pretrained_embed=glove_embed)

# Reload from disk. Per the original author's note, this disables variational
# dropout during evaluation so as to use CudaLSTM.
parser.load(save_dir)

# Demo input: one sentence given as (word, POS tag) pairs.
sentence = [
    ('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'),
    ('of', 'IN'), ('chamber', 'NN'), ('music', 'NN'), ('?', '.'),
]
print(parser.predict(sentence))

# Evaluate on the "id" file first, then on the "ood" file.
for test_file in ('data/semeval15/en.id.dm.auto.conllu',
                  'data/semeval15/en.ood.dm.auto.conllu'):
    parser.evaluate(test_file, save_dir)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval15_pas.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 18:26
from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF
from hanlp.pretrained.glove import GLOVE_6B_100D
from tests import cdroot

# NOTE(review): presumably changes into the repository root so that the
# relative `data/...` paths below resolve — confirm against tests.cdroot.
cdroot()
save_dir = 'data/model/sdp/semeval15_biaffine_pas'

# Non-trainable GloVe word vectors handed to fit() as `pretrained_embed`.
glove_embed = {
    'class_name': 'HanLP>Word2VecEmbedding',
    'config': {
        'trainable': False,
        'embeddings_initializer': 'zero',
        'filepath': GLOVE_6B_100D,
        'expand_vocab': True,
        'lowercase': True,
        'normalize': True,
    },
}

parser = BiaffineSemanticDependencyParserTF()
parser.fit('data/semeval15/en.pas.train.conll', 'data/semeval15/en.pas.dev.conll', save_dir,
           pretrained_embed=glove_embed)

# Reload from disk. Per the original author's note, this disables variational
# dropout during evaluation so as to use CudaLSTM.
parser.load(save_dir)

# Demo input: one sentence given as (word, POS tag) pairs.
sentence = [
    ('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'),
    ('of', 'IN'), ('chamber', 'NN'), ('music', 'NN'), ('?', '.'),
]
print(parser.predict(sentence))

# Evaluate on the "id" file first, then on the "ood" file.
for test_file in ('data/semeval15/en.id.pas.conll',
                  'data/semeval15/en.ood.pas.conll'):
    parser.evaluate(test_file, save_dir)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval15_psd.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 18:26
from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF
from hanlp.pretrained.glove import GLOVE_6B_100D
from tests import cdroot

# NOTE(review): presumably changes into the repository root so that the
# relative `data/...` paths below resolve — confirm against tests.cdroot.
cdroot()
save_dir = 'data/model/sdp/semeval15_biaffine_psd'

# Non-trainable GloVe word vectors handed to fit() as `pretrained_embed`.
glove_embed = {
    'class_name': 'HanLP>Word2VecEmbedding',
    'config': {
        'trainable': False,
        'embeddings_initializer': 'zero',
        'filepath': GLOVE_6B_100D,
        'expand_vocab': True,
        'lowercase': True,
        'normalize': True,
    },
}

parser = BiaffineSemanticDependencyParserTF()
parser.fit('data/semeval15/en.psd.train.conll', 'data/semeval15/en.psd.dev.conll', save_dir,
           pretrained_embed=glove_embed)

# Reload from disk. Per the original author's note, this disables variational
# dropout during evaluation so as to use CudaLSTM.
parser.load(save_dir)

# Demo input: one sentence given as (word, POS tag) pairs.
sentence = [
    ('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'),
    ('of', 'IN'), ('chamber', 'NN'), ('music', 'NN'), ('?', '.'),
]
print(parser.predict(sentence))

# Evaluate on the "id" file first, then on the "ood" file.
for test_file in ('data/semeval15/en.id.psd.conll',
                  'data/semeval15/en.ood.psd.conll'):
    parser.evaluate(test_file, save_dir)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval16_news.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-26 23:20
from hanlp.datasets.parsing.semeval16 import SEMEVAL2016_NEWS_TRAIN, SEMEVAL2016_NEWS_DEV, SEMEVAL2016_NEWS_TEST
from hanlp.pretrained.word2vec import SEMEVAL16_EMBEDDINGS_300_NEWS_CN
from hanlp.utils.tf_util import nice

# nice() is deliberately invoked before the parser module is imported.
# NOTE(review): presumably it configures the TensorFlow runtime — confirm in
# hanlp.utils.tf_util.
nice()
from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF
from tests import cdroot

# NOTE(review): presumably changes into the repository root so that the
# relative model path below resolves — confirm against tests.cdroot.
cdroot()
save_dir = 'data/model/sdp/semeval16-news'

# Non-trainable pretrained word vectors handed to fit() as `pretrained_embed`.
news_embed = {
    'class_name': 'HanLP>Word2VecEmbedding',
    'config': {
        'trainable': False,
        'embeddings_initializer': 'zero',
        'filepath': SEMEVAL16_EMBEDDINGS_300_NEWS_CN,
        'expand_vocab': True,
        'lowercase': True,
        'normalize': True,
    },
}

parser = BiaffineSemanticDependencyParserTF()
parser.fit(SEMEVAL2016_NEWS_TRAIN, SEMEVAL2016_NEWS_DEV, save_dir,
           pretrained_embed=news_embed)

# Reload what fit() wrote to save_dir before predicting and evaluating.
parser.load(save_dir)

# Demo input: one sentence given as (word, POS tag) pairs.
sentence = [
    ('中国', 'NR'), ('批准', 'VV'), ('设立', 'VV'),
    ('外商', 'NN'), ('投资', 'NN'), ('企业', 'NN'),
    ('逾', 'VV'), ('三十万', 'CD'), ('家', 'M'),
]
print(parser.predict(sentence))
parser.evaluate(SEMEVAL2016_NEWS_TEST, save_dir)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval16_text.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-26 23:20
from hanlp.datasets.parsing.semeval16 import SEMEVAL2016_TEXT_TRAIN, SEMEVAL2016_TEXT_DEV, SEMEVAL2016_TEXT_TEST
from hanlp.pretrained.word2vec import SEMEVAL16_EMBEDDINGS_300_TEXT_CN
from hanlp.utils.tf_util import nice

# nice() is deliberately invoked before the parser module is imported.
# NOTE(review): presumably it configures the TensorFlow runtime — confirm in
# hanlp.utils.tf_util.
nice()
from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF
from tests import cdroot

# NOTE(review): presumably changes into the repository root so that the
# relative model path below resolves — confirm against tests.cdroot.
cdroot()
save_dir = 'data/model/sdp/semeval16-text'

# Non-trainable pretrained word vectors handed to fit() as `pretrained_embed`.
text_embed = {
    'class_name': 'HanLP>Word2VecEmbedding',
    'config': {
        'trainable': False,
        'embeddings_initializer': 'zero',
        'filepath': SEMEVAL16_EMBEDDINGS_300_TEXT_CN,
        'expand_vocab': True,
        'lowercase': True,
        'normalize': True,
    },
}

parser = BiaffineSemanticDependencyParserTF()
parser.fit(SEMEVAL2016_TEXT_TRAIN, SEMEVAL2016_TEXT_DEV, save_dir,
           pretrained_embed=text_embed)

# Reload what fit() wrote to save_dir before predicting and evaluating.
parser.load(save_dir)

# Demo input: one sentence given as (word, POS tag) pairs.
sentence = [
    ('中国', 'NR'), ('批准', 'VV'), ('设立', 'VV'),
    ('外商', 'NN'), ('投资', 'NN'), ('企业', 'NN'),
    ('逾', 'VV'), ('三十万', 'CD'), ('家', 'M'),
]
print(parser.predict(sentence))
parser.evaluate(SEMEVAL2016_TEXT_TEST, save_dir)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tok_mtl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "view-in-github"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_mtl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Ftok_mtl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4M7ka0K5OMWU",
    "outputId": "9a1dc26a-786a-4dce-c013-7ae5017a8805"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n",
       " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n",
       " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.mtl.ALL # MTL多任务，具体任务见模型名称，语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "BMW528wGNulM"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存。自然语言处理分为许多任务，分词只是最初级的一个。与其每个任务单独创建一个模型，不如利用HanLP的联合模型一次性完成多个任务："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "0tmKBu7sNAXX",
    "outputId": "e0187328-c6d2-47fe-cf84-c5b44703940b"
   },
   "outputs": [],
   "source": [
    "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 分词\n",
    "任务越少，速度越快。如指定仅执行分词，默认细粒度："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "387cbf30-4d70-44b1-d64b-b7a5c22ae31e"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "阿婆主 来到 北京 立方庭 参观 自然 语义 科技 公司 。\n"
     ]
    }
   ],
   "source": [
    "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jj1Jk-2sPHYx"
   },
   "source": [
    "执行粗粒度分词："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "id": "1goEC7znPNkI",
    "outputId": "ddf15a17-2f5d-4bc3-d145-908fb6176552"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "阿婆主 来到 北京立方庭 参观 自然语义科技公司 。\n"
     ]
    }
   ],
   "source": [
    "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok/coarse').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wxctCigrTKu-"
   },
   "source": [
    "同时执行细粒度和粗粒度分词："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Zo08uquCTFSk",
    "outputId": "bf24a01a-a09b-4b78-fdec-2bb705b4becb"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'tok/fine': ['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司', '。'],\n",
       " 'tok/coarse': ['阿婆主', '来到', '北京立方庭', '参观', '自然语义科技公司', '。']}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok*')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`coarse`为粗分，`fine`为细分。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 注意\n",
    "Native API的输入单位限定为句子，需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外，RESTful和native两种API的语义设计完全一致，用户可以无缝互换。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "suUL042zPpLj"
   },
   "source": [
    "## 自定义词典\n",
    "自定义词典为分词任务的成员变量，要操作自定义词典，先获取分词任务，以细分标准为例："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "AzYShIssP6kq",
    "outputId": "7f07897c-8a97-4193-855d-d9e296581d0c"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<hanlp.components.mtl.tasks.tok.tag_tok.TaggingTokenization at 0x1527337f0>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tok = HanLP['tok/fine']\n",
    "tok"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "自定义词典为分词任务的成员变量："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "id": "1q4MUpgVQNlu",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(None, None)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tok.dict_combine, tok.dict_force"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "2zZkH9tRQOoi",
    "outputId": "c231c35b-1a5f-4b54-e5c3-8680d2cc1515",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "HanLP支持合并和强制两种优先级的自定义词典，以满足不同场景的需求。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "F-9gAeIVQUFG",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "不挂词典："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "F8M8cyBrQduw",
    "outputId": "c3bf7ec5-b1d4-4207-a979-2c85754c7cd7",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "商品 和 服务 项目\n"
     ]
    }
   ],
   "source": [
    "tok.dict_force = tok.dict_combine = None\n",
    "HanLP(\"商品和服务项目\", tasks='tok/fine').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "DDqQxqQaTayv",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "### 强制模式\n",
    "强制模式优先输出正向最长匹配到的自定义词条（慎用，详见[《自然语言处理入门》](http://nlp.hankcs.com/book.php)第二章）："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "bjnEqDaATdVr",
    "outputId": "3a282acc-5716-45e4-e1e2-96eefb8ee342",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "商品 和服 务 项目\n"
     ]
    }
   ],
   "source": [
    "tok.dict_force = {'和服', '服务项目'}\n",
    "HanLP(\"商品和服务项目\", tasks='tok/fine').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ldKAnVoSTgxb",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "与大众的朴素认知不同，词典优先级最高未必是好事，极有可能匹配到不该分出来的自定义词语，导致歧义。自定义词语越长，越不容易发生歧义。这启发我们将强制模式拓展为强制校正功能。\n",
    "\n",
    "强制校正原理相似，但会将匹配到的自定义词条替换为相应的分词结果:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "bwIu0f6wTgbF",
    "outputId": "b941b079-5202-420a-e7f3-8f1617a2545c",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "商品 和 服务 项目\n"
     ]
    }
   ],
   "source": [
    "tok.dict_force = {'和服务': ['和', '服务']}\n",
    "HanLP(\"商品和服务项目\", tasks='tok/fine').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 合并模式\n",
    "合并模式的优先级低于统计模型，即`dict_combine`会在统计模型的分词结果上执行最长匹配并合并匹配到的词条。一般情况下，推荐使用该模式。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "商品 和 服务项目\n"
     ]
    }
   ],
   "source": [
    "tok.dict_force = None\n",
    "tok.dict_combine = {'和服', '服务项目'}\n",
    "HanLP(\"商品和服务项目\", tasks='tok/fine').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "9aRzEeRvTlRr"
   },
   "source": [
    "需要算法基础才能理解，初学者可参考[《自然语言处理入门》](http://nlp.hankcs.com/book.php)。\n",
    "#### 空格单词"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "含有空格、制表符等（Transformer tokenizer去掉的字符）的词语需要用`tuple`的形式提供："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['如何', '评价', 'iPad Pro', '？', 'iPad  Pro', '有', '2个空格']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tok.dict_combine = {('iPad', 'Pro'), '2个空格'}\n",
    "HanLP(\"如何评价iPad Pro ？iPad  Pro有2个空格\", tasks='tok/fine')['tok/fine']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "聪明的用户请继续阅读，`tuple`词典中的字符串其实等价于该字符串的所有可能的切分方式："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys([('2', '个', '空格'), ('2', '个', '空', '格'), ('2', '个空', '格'), ('2', '个空格'), ('2个', '空', '格'), ('2个', '空格'), ('2个空格',), ('iPad', 'Pro'), ('2个空', '格')])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dict(tok.dict_combine.config[\"dictionary\"]).keys()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 单词位置"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "HanLP支持输出每个单词在文本中的原始位置，以便用于搜索引擎等场景。在词法分析中，非语素字符（空格、换行、制表符等）会被剔除，此时需要额外的位置信息才能定位每个单词："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['2021 年', 0, 6], ['HanLPv2.1', 7, 16], ['为', 17, 18], ['生产', 18, 20], ['环境', 20, 22], ['带来', 22, 24], ['次', 24, 25], ['世代', 25, 27], ['最', 27, 28], ['先进', 28, 30], ['的', 30, 31], ['多', 31, 32], ['语种', 32, 34], ['NLP', 34, 37], ['技术', 37, 39], ['。', 39, 40]]\n"
     ]
    }
   ],
   "source": [
    "tok.config.output_spans = True\n",
    "sent = '2021 年\\nHanLPv2.1 为生产环境带来次世代最先进的多语种NLP技术。'\n",
    "word_offsets = HanLP(sent, tasks='tok/fine')['tok/fine']\n",
    "print(word_offsets)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回格式为三元组（单词，单词的起始下标，单词的终止下标），下标以字符级别计量。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "for word, begin, end in word_offsets:\n",
    "    assert word == sent[begin:end]"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "authorship_tag": "ABX9TyNRpO7rdchCK1UmB0nQmPrG",
   "collapsed_sections": [],
   "include_colab_link": true,
   "name": "tok_mtl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tok_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Ftok_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "#### 申请密钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 分词\n",
    "HanLP线上模型训练自`9970`万字的大型综合语料库，覆盖新闻、社交媒体、金融、法律等多个领域，是已知范围内**全世界最大**的中文分词语料库。语料库规模决定实际效果，面向生产环境的语料库应当在千万字量级。自然语义的语言学专家一直在持续标注该语料库，与时俱进保持最先进的分词质量。\n",
    "在分词标准上，HanLP提供细粒度和粗粒度两种颗粒度，细粒度适合搜索引擎业务，粗粒度适合文本挖掘业务。\n",
    "### 细粒度分词\n",
    "默认细粒度："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['商品', '和', '服务', '。'],\n",
       " ['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司', '。']]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司。')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "用户也可以直接将`HanLP`当作函数调用，并且打印漂亮的分词结果："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "6fbb3eac-df26-4a55-8ba9-975d6cede227"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">商品&nbsp;和&nbsp;服务&nbsp;。</pre></div><br><div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">阿婆主&nbsp;来到&nbsp;北京&nbsp;立方庭&nbsp;参观&nbsp;自然&nbsp;语义&nbsp;科技&nbsp;公司&nbsp;。</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回类型为[Document](https://hanlp.hankcs.com/docs/api/common/document.html)，是`dict`的子类，拓展了很多操作各种语言学结构的方法。\n",
    "\n",
    "两个接口都会对文本进行分句，所以返回的结果一定是句子的列表。推荐在不超过服务器允许的最大长度的前提下，尽量传入整篇文章，以提高分词速度。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jj1Jk-2sPHYx"
   },
   "source": [
    "### 粗粒度分词\n",
    "执行粗颗粒度分词："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['商品', '和', '服务', '。'], ['阿婆主', '来到', '北京', '立方庭', '参观', '自然语义科技公司']]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司', coarse=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "或者直接当函数调用："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "id": "1goEC7znPNkI",
    "outputId": "ddf15a17-2f5d-4bc3-d145-908fb6176552"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">阿婆主&nbsp;来到&nbsp;北京&nbsp;立方庭&nbsp;参观&nbsp;自然语义科技公司&nbsp;。</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok/coarse').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wxctCigrTKu-"
   },
   "source": [
    "### 同时执行细粒度和粗粒度分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Zo08uquCTFSk",
    "outputId": "bf24a01a-a09b-4b78-fdec-2bb705b4becb"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'tok/fine': [['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司', '。']],\n",
       " 'tok/coarse': [['阿婆主', '来到', '北京', '立方庭', '参观', '自然语义科技公司', '。']]}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok*')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`fine`为细分，`coarse`为粗分。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 多语种分词\n",
    "得益于语言无关的设计，HanLP支持包括简繁中英日俄法德在内的104种语言上的分词。这一切，只需指定`language='mul'`即可实现。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">In&nbsp;2021&nbsp;,&nbsp;HanLPv2.1&nbsp;delivers&nbsp;state-of-the-art&nbsp;multilingual&nbsp;NLP&nbsp;techniques&nbsp;to&nbsp;production&nbsp;environments&nbsp;.</pre></div><br><div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">2021&nbsp;年&nbsp;、&nbsp;HanLPv2.1&nbsp;は&nbsp;次&nbsp;世代&nbsp;の&nbsp;最&nbsp;先端&nbsp;多&nbsp;言語&nbsp;NLP&nbsp;技術&nbsp;を&nbsp;本番&nbsp;環境&nbsp;に&nbsp;導入&nbsp;します&nbsp;。</pre></div><br><div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">2021&nbsp;年&nbsp;HanLPv2.1&nbsp;为&nbsp;生产&nbsp;环境&nbsp;带来&nbsp;次世代&nbsp;最&nbsp;先进的&nbsp;多&nbsp;语种&nbsp;NLP&nbsp;技术&nbsp;。</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP(['In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.',\n",
    "       '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',\n",
    "       '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'], tasks='tok', language='mul').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "自然语言处理分为许多任务，分词只是最初级的一个。也许大家只听说过中文分词，但HanLP并不局限于分词。HanLP的使命是普及最前沿的自然语言处理技术到生产环境，所以在其他教程中你会见到许多更高级的NLP任务以及相应的API用法。"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "tok_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tok_stl.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_stl.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Ftok_stl.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标识符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4M7ka0K5OMWU",
    "outputId": "f931579a-f5a8-487a-a89e-33d5477584c3"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'SIGHAN2005_PKU_CONVSEG': 'https://file.hankcs.com/hanlp/tok/sighan2005-pku-convseg_20200110_153722.zip',\n",
       " 'SIGHAN2005_MSR_CONVSEG': 'https://file.hankcs.com/hanlp/tok/convseg-msr-nocrf-noembed_20200110_153524.zip',\n",
       " 'CTB6_CONVSEG': 'https://file.hankcs.com/hanlp/tok/ctb6_convseg_nowe_nocrf_20200110_004046.zip',\n",
       " 'PKU_NAME_MERGED_SIX_MONTHS_CONVSEG': 'https://file.hankcs.com/hanlp/tok/pku98_6m_conv_ngram_20200110_134736.zip',\n",
       " 'LARGE_ALBERT_BASE': 'https://file.hankcs.com/hanlp/tok/large_corpus_cws_albert_base_20211228_160926.zip',\n",
       " 'SIGHAN2005_PKU_BERT_BASE_ZH': 'https://file.hankcs.com/hanlp/tok/sighan2005_pku_bert_base_zh_20201231_141130.zip',\n",
       " 'COARSE_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/tok/coarse_electra_small_20220616_012050.zip',\n",
       " 'FINE_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/tok/fine_electra_small_20220615_231803.zip',\n",
       " 'CTB9_TOK_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/tok/ctb9_electra_small_20220215_205427.zip',\n",
       " 'CTB9_TOK_ELECTRA_BASE': 'http://download.hanlp.com/tok/extra/ctb9_tok_electra_base_20220426_111949.zip',\n",
       " 'CTB9_TOK_ELECTRA_BASE_CRF': 'http://download.hanlp.com/tok/extra/ctb9_tok_electra_base_crf_20220426_161255.zip',\n",
       " 'MSR_TOK_ELECTRA_BASE_CRF': 'http://download.hanlp.com/tok/extra/msra_crf_electra_base_20220507_113936.zip',\n",
       " 'UD_TOK_MMINILMV2L6': 'https://file.hankcs.com/hanlp/tok/ud_tok_mMiniLMv2L6_no_space_mul_20220619_091824.zip',\n",
       " 'UD_TOK_MMINILMV2L12': 'https://file.hankcs.com/hanlp/tok/ud_tok_mMiniLMv2L12_no_space_mul_20220619_091159.zip'}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.tok.ALL # 语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "BMW528wGNulM"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存。自然语言处理分为许多任务，分词只是最初级的一个。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "0tmKBu7sNAXX",
    "outputId": "8977891f-9e64-4e39-8ce6-264a791541a3"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<hanlp.components.tokenizers.transformer.TransformerTaggingTokenizer at 0x10420e5b0>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)\n",
    "tok"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 进阶知识\n",
    "你可以通过加载不同的模型实现各种颗粒度、各种分词标准、各种领域的中文分词。其中，coarse和fine模型训练自`9970`万字的大型综合语料库，覆盖新闻、社交媒体、金融、法律等多个领域，是已知范围内**全世界最大**的中文分词语料库。语料库规模决定实际效果，面向生产环境的语料库应当在千万字量级。欢迎用户在自己的语料上[训练或微调模型](https://github.com/hankcs/HanLP/tree/master/plugins/hanlp_demo/hanlp_demo/zh/train)以适应新领域。语料库标注标准决定最终的分词标准，模型的准确率决定多大程度上再现该分词标准。更多背景知识请参考[《自然语言处理入门》](http://nlp.hankcs.com/book.php)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "KYH1oEKkctuy"
   },
   "source": [
    "## 执行分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "uzex--zFcqKB",
    "outputId": "a4db6808-1039-4803-84af-2687cce0fa7b"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['商品', '和', '服务', '。'], ['晓美焰', '来到', '北京立方庭', '参观', '自然语义科技公司']]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tok(['商品和服务。', '晓美焰来到北京立方庭参观自然语义科技公司'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 细分标准"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "你可以通过加载`FINE_ELECTRA_SMALL_ZH`模型实现细粒度中文分词："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "tok_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "无论哪个模型，分词器的接口是完全一致的："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['晓美焰', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tok_fine('晓美焰来到北京立方庭参观自然语义科技公司')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 无限长度\n",
    "众所周知，Transformer的输入有长度限制（通常是512）。幸运的是，HanLP的滑动窗口技巧完美地突破了该限制。只要你的内存（显存）足够，HanLP就可以处理无限长的句子。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 并行分词\n",
    "无论是CPU还是GPU，同时传入多个句子都将并行分词。也就是说，仅花费1个句子的时间可以处理多个句子。然而工作研究中的文本通常是一篇文档，而不是许多句子。此时可以利用HanLP提供的分句功能和流水线模式优雅应对，既能处理长文本又能并行化。只需创建一个流水线`pipeline`，第一级管道分句，第二级管道分词："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['量体裁衣', '，', 'HanLP', '提供', 'RESTful', '和', 'native', '两种', 'API', '。'],\n",
       " ['两者', '在', '语义', '上', '保持', '一致', '，', '在', '代码', '上', '坚持', '开源', '。']]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP = hanlp.pipeline() \\\n",
    "    .append(hanlp.utils.rules.split_sentence) \\\n",
    "    .append(tok)\n",
    "HanLP('量体裁衣，HanLP提供RESTful和native两种API。两者在语义上保持一致，在代码上坚持开源。')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回结果是每个句子的分词`list`，如果要将它们合并到一个`list`里该怎么办呢？聪明的用户可能已经想到了，再加一级`lambda`管道："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['量体裁衣', '，', 'HanLP', '提供', 'RESTful', '和', 'native', '两种', 'API', '。', '两者', '在', '语义', '上', '保持', '一致', '，', '在', '代码', '上', '坚持', '开源', '。']\n"
     ]
    }
   ],
   "source": [
    "HanLP.append(lambda sents: sum(sents, []))\n",
    "print(HanLP('量体裁衣，HanLP提供RESTful和native两种API。两者在语义上保持一致，在代码上坚持开源。'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "suUL042zPpLj"
   },
   "source": [
    "## 自定义词典"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1q4MUpgVQNlu"
   },
   "source": [
    "智者千虑，必有一失。模型偶尔也会犯错误，比如某个旧版本模型在不挂词典时会犯以下错误（最新版已经修复）："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "2zZkH9tRQOoi",
    "outputId": "a74db6c6-0a71-411c-de78-60621a43eded",
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['首相', '和', '川', '普通', '电话']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tok = hanlp.load('https://file.hankcs.com/hanlp/tok/coarse_electra_small_20220220_013548.zip')\n",
    "tok.dict_force = tok.dict_combine = None\n",
    "tok(\"首相和川普通电话\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "上面分词任务两个成员变量`dict_force`和`dict_combine`为自定义词典："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "id": "AzYShIssP6kq",
    "outputId": "ce3bb1aa-5042-47d7-8ac9-7ed0fd478c77"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(None, None)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tok.dict_combine, tok.dict_force"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "HanLP支持合并和强制两种优先级的自定义词典，以满足不同场景的需求。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "F-9gAeIVQUFG"
   },
   "source": [
    "### 强制模式\n",
    "强制模式`dict_force`优先输出正向最长匹配到的自定义词条，在这个案例中，用户的第一反应也许是将`川普`加入到`dict_force`中，强制分词器输出`川普`："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "F8M8cyBrQduw",
    "outputId": "c156513c-d13c-47f1-bc3a-c73a8649ddb1"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['首相', '和', '川普', '通', '电话'],\n",
       " ['银', '川普', '通人', '与', '川普', '通', '电话', '讲', '四', '川普', '通话']]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tok.dict_force = {'川普'}\n",
    "tok([\"首相和川普通电话\", \"银川普通人与川普通电话讲四川普通话\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "DDqQxqQaTayv"
   },
   "source": [
    "然而与大众的朴素认知不同，词典优先级最高未必是好事。极有可能匹配到不该分出来的自定义词语，导致歧义。即便是将`普通人`或`普通话`加入到词典中也无济于事，因为在正向最长匹配第二个句子的过程中，会匹配到`川普`而不会匹配后两者。这也解释了为什么自定义词典中存在的词可能分不出来：当歧义发生时，两个词语发生交叉冲突，自然有所取舍，无法同时输出两者。那种同时输出句子或长单词中所有可能的单词，并且允许单词交叉的算法，并非分词，而是多模式字符串匹配。你需要基本的算法知识才能理解这一点，总之一般情况下应当慎用强制模式，详见[《自然语言处理入门》](http://nlp.hankcs.com/book.php)第二章。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "自定义词语越长，越不容易发生歧义。这启发我们将强制模式拓展为强制校正功能。强制校正原理相似，但会将匹配到的自定义词条替换为相应的分词结果："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "bjnEqDaATdVr",
    "outputId": "2e694aed-a71f-4a28-d981-0767d9e263e9"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['首相', '和', '川普', '通', '电话'],\n",
       " ['银川', '普通人', '与', '川普', '通', '电话', '讲', '四川', '普通话']]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tok.dict_force = {'川普通电话': ['川普', '通', '电话']}\n",
    "tok([\"首相和川普通电话\", \"银川普通人与川普通电话讲四川普通话\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "强制校正是一种短平快的规则补丁，需要针对每种可能产生歧义的语境，截取一个片段执行校正。当你积累了很多歧义片段与相应的校正补丁后，其实就应该考虑微调模型。微调可以让模型增量式学习这些歧义语境，摆脱对补丁规则的依赖，同时举一反三应对新的语境。从错误中积累经验，用经验预测未来，这就是机器学习与人工智能的魅力。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "事实上，“川普通电话”这种例子不需要词典即可分对。只需提供给神经网络足够的上下文线索（这也是真实文本所具备的），告诉神经网络“川普是美国总统”："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['首相', '和', '川普', '通', '电话', '，', '川普', '是', '美国', '总统', '。'], ['银川', '普通人', '与', '川普', '通', '电话', '讲', '四川', '普通话', '，', '川普', '是', '美国', '总统', '。']]\n"
     ]
    }
   ],
   "source": [
    "tok.dict_force = tok.dict_combine = None\n",
    "print(tok([\"首相和川普通电话，川普是美国总统。\", \"银川普通人与川普通电话讲四川普通话，川普是美国总统。\"]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "9aRzEeRvTlRr"
   },
   "source": [
    "在上面的例子中，虽然词典对“川普”没有施加任何影响，但是更丰富的上下文促进了神经网络对语境的理解，使其得出了正确的结果。深度学习中的神经网络似乎展示了些许智能，感兴趣的初学者可参考[《自然语言处理入门》](http://nlp.hankcs.com/book.php)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ldKAnVoSTgxb"
   },
   "source": [
    "### 合并模式\n",
    "合并模式的优先级低于统计模型，即`dict_combine`会在统计模型的分词结果上执行最长匹配并合并匹配到的词条。一般情况下，推荐使用该模式。比如，将“美国总统”加入`dict_combine`后会合并`['美国', '总统']`，而不会合并`['美国', '总', '统筹部']`为`['美国总统', '筹部']`："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "bwIu0f6wTgbF",
    "outputId": "22807b6a-3472-431b-d1e3-95f6b761c84c"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['首相', '和', '川普', '通', '电话', '，', '川普', '是', '美国总统', '。'], ['银川', '普通人', '与', '川普', '通', '电话', '讲', '四川', '普通话', '，', '川普', '是', '美国总统', '。'], ['美国', '总统筹部', '部长', '是', '谁', '？']]\n"
     ]
    }
   ],
   "source": [
    "tok.dict_force = None\n",
    "tok.dict_combine = {'美国总统'}\n",
    "print(tok([\"首相和川普通电话，川普是美国总统。\", \"银川普通人与川普通电话讲四川普通话，川普是美国总统。\", \"美国总统筹部部长是谁？\"]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 空格单词"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "含有空格、制表符等（Transformer tokenizer去掉的字符）的词语需要用`tuple`的形式提供："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['如何', '评价', 'iPad Pro', '？', 'iPad  Pro', '有', '2个空格']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tok.dict_combine = {('iPad', 'Pro'), '2个空格'}\n",
    "tok(\"如何评价iPad Pro ？iPad  Pro有2个空格\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "聪明的用户请继续阅读，`tuple`词典中的字符串其实等价于该字符串的所有可能的切分方式："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys([('iPad', 'Pro'), ('2个空格',), ('2', '个', '空格'), ('2', '个', '空', '格'), ('2', '个空格'), ('2', '个空', '格'), ('2个', '空', '格'), ('2个', '空格'), ('2个空', '格')])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dict(tok.dict_combine.config[\"dictionary\"]).keys()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 单词位置"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "HanLP支持输出每个单词在文本中的原始位置，以便用于搜索引擎等场景。在词法分析中，非语素字符（空格、换行、制表符等）会被剔除，此时需要额外的位置信息才能定位每个单词："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['2021', 0, 4], ['年', 5, 6], ['HanLPv2.1', 7, 16], ['为', 17, 18], ['生产', 18, 20], ['环境', 20, 22], ['带来', 22, 24], ['次', 24, 25], ['世代', 25, 27], ['最', 27, 28], ['先进', 28, 30], ['的', 30, 31], ['多', 31, 32], ['语种', 32, 34], ['NLP', 34, 37], ['技术', 37, 39], ['。', 39, 40]]\n"
     ]
    }
   ],
   "source": [
    "tok.config.output_spans = True\n",
    "sent = '2021 年\\nHanLPv2.1 为生产环境带来次世代最先进的多语种NLP技术。'\n",
    "word_offsets = tok(sent)\n",
    "print(word_offsets)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "返回格式为三元组（单词，单词的起始下标，单词的终止下标），下标以字符级别计量。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "for word, begin, end in word_offsets:\n",
    "    assert word == sent[begin:end]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 多语种支持"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "得益于语言无关的设计，以及大规模多语种语料库，最近HanLP发布了支持[130种语言](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/tok.html#hanlp.pretrained.tok.UD_TOK_MMINILMV2L12)的单任务分词器。用法与中文分词器相同："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "mul = hanlp.load(hanlp.pretrained.tok.UD_TOK_MMINILMV2L6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['In', '2021', ',', 'HanLPv2.1', 'delivers', 'state-of-the-art', 'multilingual', 'NLP', 'techniques', 'to', 'production', 'environments', '.'], ['2021年', '、', 'HanLPv2.1', 'は', '次世代', 'の', '最', '先端', '多', '言語', 'NLP', '技術', 'を', '本番', '環境', 'に', '導入', 'し', 'ます', '。'], ['2021年', 'HanLPv2.1', '为', '生产', '环境', '带来', '次', '世代', '最', '先进', '的', '多语种', 'NLP', '技术', '。'], ['奈須きのこ', 'は', '1973年', '11月', '28日', 'に', '千葉', '県', '円空山', 'で', '生まれ', '、', 'ゲーム', '制作', '会社', '「', 'ノーツ', '」', 'の', '設立', '者', 'だ', '。']]\n"
     ]
    }
   ],
   "source": [
    "print(mul([\n",
    "    'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.',\n",
    "    '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',\n",
    "    '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',\n",
    "    '奈須きのこは1973年11月28日に千葉県円空山で生まれ、ゲーム制作会社「ノーツ」の設立者だ。'\n",
    "]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "目前，多语种分词器的效果并不如单语种好。欢迎在你自己的单语种语料上自行训练新模型，也欢迎开源你的语料和模型。"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "authorship_tag": "ABX9TyPxXzYAXgLUW5uKV7v0/2iP",
   "collapsed_sections": [],
   "name": "tok_stl.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/train/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 20:12


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/train/finetune_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2023-10-18 18:49
import os

import hanlp
from hanlp.components.ner.transformer_ner import TransformerNamedEntityRecognizer
from tests import cdroot

# Demo: fine-tune a pretrained Chinese NER model on a tiny custom corpus, then use it in a
# tokenize -> NER pipeline.
# NOTE(review): presumably ``cdroot`` switches the working directory to the project root so that
# the relative ``data/...`` paths below resolve - confirm against ``tests.cdroot``.
cdroot()

your_training_corpus = 'data/ner/finetune/word_to_iobes.tsv'
your_development_corpus = your_training_corpus  # Use a different one in reality
save_dir = 'data/ner/finetune/model'

# On first run, create a toy corpus: one token per line, tab-separated from its IOBES tag.
if not os.path.exists(your_training_corpus):
    os.makedirs(os.path.dirname(your_training_corpus), exist_ok=True)
    # The corpus contains Chinese text, so the encoding is pinned to UTF-8 rather than left to the
    # platform's locale-dependent default (which is not UTF-8 on e.g. many Windows setups).
    with open(your_training_corpus, 'w', encoding='utf-8') as out:
        out.write(
'''训练\tB-NLP
语料\tE-NLP
为\tO
IOBES\tO
格式\tO
'''
        )

ner = TransformerNamedEntityRecognizer()
# Train only when no model has been saved yet; otherwise reuse the saved one.
if not os.path.exists(save_dir):
    print('Start fine-tuning ')
    ner.fit(
        trn_data=your_training_corpus,
        dev_data=your_development_corpus,
        save_dir=save_dir,
        epochs=50,  # Since the corpus is small, overfit it
        finetune=hanlp.pretrained.ner.MSRA_NER_ELECTRA_SMALL_ZH,
        # You MUST use the same parameters as the model being fine-tuned:
        average_subwords=True,
        transformer='hfl/chinese-electra-180g-small-discriminator',
    )
else:
    print('Load fine-tuned model')
    ner = hanlp.load(save_dir)

# Two-stage pipeline: the tokenizer's output is stored under 'tok' and the NER output under 'ner'.
HanLP = hanlp.pipeline()\
    .append(hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH), output_key='tok')\
    .append(ner, output_key='ner')
HanLP(['训练语料为IOBES格式', '晓美焰来到北京立方庭参观自然语义科技公司。']).pretty_print()


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/train/open_base.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-03 14:24
from hanlp_demo import block_windows
from hanlp.common.dataset import SortingSamplerBuilder
from hanlp.common.transform import NormalizeCharacter
from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
from hanlp.components.mtl.tasks.constituency import CRFConstituencyParsing
from hanlp.components.mtl.tasks.dep import BiaffineDependencyParsing
from hanlp.components.mtl.tasks.ner.tag_ner import TaggingNamedEntityRecognition
from hanlp.components.mtl.tasks.pos import TransformerTagging
from hanlp.components.mtl.tasks.sdp import BiaffineSemanticDependencyParsing
from hanlp.components.mtl.tasks.srl.bio_srl import SpanBIOSemanticRoleLabeling
from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization
from hanlp.datasets.ner.msra import MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TRAIN, MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_DEV, \
    MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TEST
from hanlp.datasets.parsing.ctb8 import CTB8_POS_TRAIN, CTB8_POS_DEV, CTB8_POS_TEST, CTB8_SD330_TEST, CTB8_SD330_DEV, \
    CTB8_SD330_TRAIN, CTB8_CWS_TRAIN, CTB8_CWS_DEV, CTB8_CWS_TEST, CTB8_BRACKET_LINE_NOEC_TRAIN, \
    CTB8_BRACKET_LINE_NOEC_DEV, CTB8_BRACKET_LINE_NOEC_TEST
from hanlp.datasets.parsing.semeval16 import SEMEVAL2016_TEXT_TRAIN_CONLLU, SEMEVAL2016_TEXT_TEST_CONLLU, \
    SEMEVAL2016_TEXT_DEV_CONLLU
from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_TEST, ONTONOTES5_CONLL12_CHINESE_DEV, \
    ONTONOTES5_CONLL12_CHINESE_TRAIN
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding
from hanlp.layers.transformers.relative_transformer import RelativeTransformerEncoder
from hanlp.utils.lang.zh.char_table import HANLP_CHAR_TABLE_JSON
from hanlp.utils.log_util import cprint
from tests import cdroot

# Imported from the project's `tests` package; presumably switches the working
# directory to the repository root so relative paths resolve there - TODO confirm.
cdroot()

# Corpus splits grouped per task, each listed in train, dev, test order
# (going by the constant names).
cws_splits = (CTB8_CWS_TRAIN, CTB8_CWS_DEV, CTB8_CWS_TEST)
pos_splits = (CTB8_POS_TRAIN, CTB8_POS_DEV, CTB8_POS_TEST)
ner_splits = (
    MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TRAIN,
    MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_DEV,
    MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TEST,
)
srl_splits = (
    ONTONOTES5_CONLL12_CHINESE_TRAIN,
    ONTONOTES5_CONLL12_CHINESE_DEV,
    ONTONOTES5_CONLL12_CHINESE_TEST,
)
dep_splits = (CTB8_SD330_TRAIN, CTB8_SD330_DEV, CTB8_SD330_TEST)
sdp_splits = (SEMEVAL2016_TEXT_TRAIN_CONLLU, SEMEVAL2016_TEXT_DEV_CONLLU, SEMEVAL2016_TEXT_TEST_CONLLU)
con_splits = (CTB8_BRACKET_LINE_NOEC_TRAIN, CTB8_BRACKET_LINE_NOEC_DEV, CTB8_BRACKET_LINE_NOEC_TEST)

# Task name -> task object, inserted in the order tok, pos, ner, srl, dep, sdp, con.
# 'tok' is the only task without a `dependencies` entry; all others declare
# dependencies='tok'. Each task gets its own SortingSamplerBuilder instance.
tasks = {}

# Tokenization on CTB8, BMES tagging with character normalization applied to 'token'.
tasks['tok'] = TaggingTokenization(
    *cws_splits,
    SortingSamplerBuilder(batch_size=32),
    max_seq_len=510,
    hard_constraint=True,
    char_level=True,
    tagging_scheme='BMES',
    lr=1e-3,
    transform=NormalizeCharacter(HANLP_CHAR_TABLE_JSON, 'token'),
)

# Part-of-speech tagging on CTB8.
tasks['pos'] = TransformerTagging(
    *pos_splits,
    SortingSamplerBuilder(batch_size=32),
    hard_constraint=True,
    max_seq_len=510,
    char_level=True,
    dependencies='tok',
    lr=1e-3,
)

# Named entity recognition on MSRA (IOBES), with an extra relative transformer
# encoder of size 768 passed as `secondary_encoder`.
tasks['ner'] = TaggingNamedEntityRecognition(
    *ner_splits,
    SortingSamplerBuilder(batch_size=32),
    lr=1e-3,
    secondary_encoder=RelativeTransformerEncoder(768, k_as_x=True),
    dependencies='tok',
)

# Semantic role labeling on OntoNotes 5 Chinese; the sampler is additionally
# given batch_max_tokens=2048.
tasks['srl'] = SpanBIOSemanticRoleLabeling(
    *srl_splits,
    SortingSamplerBuilder(batch_size=32, batch_max_tokens=2048),
    lr=1e-3,
    crf=True,
    dependencies='tok',
)

# Dependency parsing on CTB8 (SD 3.3.0 files).
tasks['dep'] = BiaffineDependencyParsing(
    *dep_splits,
    SortingSamplerBuilder(batch_size=32),
    lr=1e-3,
    tree=True,
    punct=True,
    dependencies='tok',
)

# Semantic dependency parsing on SemEval-2016 text data.
tasks['sdp'] = BiaffineSemanticDependencyParsing(
    *sdp_splits,
    SortingSamplerBuilder(batch_size=32),
    lr=1e-3,
    apply_constraint=True,
    punct=True,
    dependencies='tok',
)

# Constituency parsing on CTB8 bracketed trees.
tasks['con'] = CRFConstituencyParsing(
    *con_splits,
    SortingSamplerBuilder(batch_size=32),
    lr=1e-3,
    dependencies='tok',
)
# Directory handed to fit(), evaluate() and load() below.
save_dir = 'data/model/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base'
mtl = MultiTaskLearning()

# Shared contextual encoder over the 'token' field, backed by the ELECTRA base
# discriminator checkpoint.
encoder = ContextualWordEmbedding(
    'token',
    "hfl/chinese-electra-180g-base-discriminator",
    average_subwords=True,
    max_sequence_length=512,
    word_dropout=.1,
)

# The fourth positional argument (30) is presumably the number of epochs -
# confirm against MultiTaskLearning.fit.
mtl.fit(
    encoder,
    tasks,
    save_dir,
    30,
    lr=1e-3,
    encoder_lr=5e-5,
    grad_norm=1,
    gradient_accumulation=2,
    eval_trn=False,
)
cprint(f'Model saved in [cyan]{save_dir}[/cyan]')

# Evaluate from save_dir, reload from it, then run one demo sentence and print the result.
mtl.evaluate(save_dir)
mtl.load(save_dir)
demo_result = mtl('华纳音乐旗下的新垣结衣在12月21日于日本武道馆举办歌手出道活动')
print(demo_result)


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/train/open_small.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-03 14:24
from hanlp_demo import block_windows
from hanlp.common.dataset import SortingSamplerBuilder
from hanlp.common.transform import NormalizeCharacter
from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
from hanlp.components.mtl.tasks.constituency import CRFConstituencyParsing
from hanlp.components.mtl.tasks.dep import BiaffineDependencyParsing
from hanlp.components.mtl.tasks.ner.tag_ner import TaggingNamedEntityRecognition
from hanlp.components.mtl.tasks.pos import TransformerTagging
from hanlp.components.mtl.tasks.sdp import BiaffineSemanticDependencyParsing
from hanlp.components.mtl.tasks.srl.bio_srl import SpanBIOSemanticRoleLabeling
from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization
from hanlp.datasets.ner.msra import MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TEST, MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_DEV, \
    MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TRAIN
from hanlp.datasets.parsing.ctb8 import CTB8_POS_TRAIN, CTB8_POS_DEV, CTB8_POS_TEST, CTB8_SD330_TEST, CTB8_SD330_DEV, \
    CTB8_SD330_TRAIN, CTB8_CWS_TRAIN, CTB8_CWS_DEV, CTB8_CWS_TEST, CTB8_BRACKET_LINE_NOEC_TEST, \
    CTB8_BRACKET_LINE_NOEC_DEV, CTB8_BRACKET_LINE_NOEC_TRAIN
from hanlp.datasets.parsing.semeval16 import SEMEVAL2016_TEXT_TRAIN_CONLLU, SEMEVAL2016_TEXT_TEST_CONLLU, \
    SEMEVAL2016_TEXT_DEV_CONLLU
from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_TEST, ONTONOTES5_CONLL12_CHINESE_DEV, \
    ONTONOTES5_CONLL12_CHINESE_TRAIN
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding
from hanlp.layers.transformers.relative_transformer import RelativeTransformerEncoder
from hanlp.utils.lang.zh.char_table import HANLP_CHAR_TABLE_JSON
from hanlp.utils.log_util import cprint
from tests import cdroot

# Imported from the project's `tests` package; presumably switches the working
# directory to the repository root so relative paths resolve there - TODO confirm.
cdroot()

# Corpus splits grouped per task, each listed in train, dev, test order
# (going by the constant names).
cws_splits = (CTB8_CWS_TRAIN, CTB8_CWS_DEV, CTB8_CWS_TEST)
pos_splits = (CTB8_POS_TRAIN, CTB8_POS_DEV, CTB8_POS_TEST)
ner_splits = (
    MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TRAIN,
    MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_DEV,
    MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TEST,
)
srl_splits = (
    ONTONOTES5_CONLL12_CHINESE_TRAIN,
    ONTONOTES5_CONLL12_CHINESE_DEV,
    ONTONOTES5_CONLL12_CHINESE_TEST,
)
dep_splits = (CTB8_SD330_TRAIN, CTB8_SD330_DEV, CTB8_SD330_TEST)
sdp_splits = (SEMEVAL2016_TEXT_TRAIN_CONLLU, SEMEVAL2016_TEXT_DEV_CONLLU, SEMEVAL2016_TEXT_TEST_CONLLU)
con_splits = (CTB8_BRACKET_LINE_NOEC_TRAIN, CTB8_BRACKET_LINE_NOEC_DEV, CTB8_BRACKET_LINE_NOEC_TEST)

# Task name -> task object, inserted in the order tok, pos, ner, srl, dep, sdp, con.
# 'tok' is the only task without a `dependencies` entry; all others declare
# dependencies='tok'. Each task gets its own SortingSamplerBuilder instance.
tasks = {}

# Tokenization on CTB8, BMES tagging with character normalization applied to 'token'.
tasks['tok'] = TaggingTokenization(
    *cws_splits,
    SortingSamplerBuilder(batch_size=32),
    max_seq_len=510,
    hard_constraint=True,
    char_level=True,
    tagging_scheme='BMES',
    lr=1e-3,
    transform=NormalizeCharacter(HANLP_CHAR_TABLE_JSON, 'token'),
)

# Part-of-speech tagging on CTB8.
tasks['pos'] = TransformerTagging(
    *pos_splits,
    SortingSamplerBuilder(batch_size=32),
    hard_constraint=True,
    max_seq_len=510,
    char_level=True,
    dependencies='tok',
    lr=1e-3,
)

# Named entity recognition on MSRA (IOBES), with an extra relative transformer
# encoder (size 256, feedforward_dim=128) passed as `secondary_encoder`.
tasks['ner'] = TaggingNamedEntityRecognition(
    *ner_splits,
    SortingSamplerBuilder(batch_size=32),
    max_seq_len=510,
    hard_constraint=True,
    char_level=True,
    lr=1e-3,
    secondary_encoder=RelativeTransformerEncoder(256, k_as_x=True, feedforward_dim=128),
    dependencies='tok',
)

# Semantic role labeling on OntoNotes 5 Chinese; the sampler is additionally
# given batch_max_tokens=1280.
tasks['srl'] = SpanBIOSemanticRoleLabeling(
    *srl_splits,
    SortingSamplerBuilder(batch_size=32, batch_max_tokens=1280),
    lr=1e-3,
    crf=True,
    dependencies='tok',
)

# Dependency parsing on CTB8 (SD 3.3.0 files), with both tree=True and proj=True.
tasks['dep'] = BiaffineDependencyParsing(
    *dep_splits,
    SortingSamplerBuilder(batch_size=32),
    lr=1e-3,
    tree=True,
    proj=True,
    punct=True,
    dependencies='tok',
)

# Semantic dependency parsing on SemEval-2016 text data.
tasks['sdp'] = BiaffineSemanticDependencyParsing(
    *sdp_splits,
    SortingSamplerBuilder(batch_size=32),
    lr=1e-3,
    apply_constraint=True,
    punct=True,
    dependencies='tok',
)

# Constituency parsing on CTB8 bracketed trees.
tasks['con'] = CRFConstituencyParsing(
    *con_splits,
    SortingSamplerBuilder(batch_size=32),
    lr=1e-3,
    dependencies='tok',
)
# Directory handed to fit(), evaluate() and load() below.
save_dir = 'data/model/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small'
mtl = MultiTaskLearning()
# Announce the output location up front, before the training run starts.
cprint(f'Model will be saved in [cyan]{save_dir}[/cyan]')

# Shared contextual encoder over the 'token' field, backed by the ELECTRA small
# discriminator checkpoint.
encoder = ContextualWordEmbedding(
    'token',
    "hfl/chinese-electra-180g-small-discriminator",
    average_subwords=True,
    max_sequence_length=512,
    word_dropout=.1,
)

# The fourth positional argument (30) is presumably the number of epochs -
# confirm against MultiTaskLearning.fit.
mtl.fit(
    encoder,
    tasks,
    save_dir,
    30,
    lr=1e-3,
    encoder_lr=5e-5,
    grad_norm=1,
    gradient_accumulation=1,
    eval_trn=False,
)
cprint(f'Model saved in [cyan]{save_dir}[/cyan]')

# Evaluate from save_dir, reload from it, then pretty-print one demo sentence.
mtl.evaluate(save_dir)
mtl.load(save_dir)
demo_result = mtl('华纳音乐旗下的新垣结衣在12月21日于日本武道馆举办歌手出道活动')
demo_result.pretty_print()


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/train_sota_bert_pku.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-11 02:47
from hanlp.common.dataset import SortingSamplerBuilder
from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer
from hanlp.datasets.tokenization.sighan2005.pku import SIGHAN2005_PKU_TRAIN_ALL, SIGHAN2005_PKU_TEST
from tests import cdroot

# Imported from the project's `tests` package; presumably switches the working
# directory to the repository root so relative paths resolve there - TODO confirm.
cdroot()

# Directory handed to fit() and evaluate() below.
save_dir = 'data/model/cws/sighan2005_pku_bert_base_96.7'
tokenizer = TransformerTaggingTokenizer()

# Keyword hyperparameters for the training run, collected in one place.
fit_options = dict(
    max_seq_len=300,
    char_level=True,
    hard_constraint=True,
    sampler_builder=SortingSamplerBuilder(batch_size=32),
    epochs=3,
    adam_epsilon=1e-6,
    warmup_steps=0.1,
    weight_decay=0.01,
    word_dropout=0.1,
    seed=1660853059,
)

# The test set is passed in the second (dev) position on purpose:
# conventionally, no devset is used. See Tian et al. (2020).
tokenizer.fit(
    SIGHAN2005_PKU_TRAIN_ALL,
    SIGHAN2005_PKU_TEST,
    save_dir,
    'bert-base-chinese',
    **fit_options,
)

# Score the trained model on the PKU test set, then report where it was written.
tokenizer.evaluate(SIGHAN2005_PKU_TEST, save_dir)
print(f'Model saved in {save_dir}')


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tst_restful.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WfGpInivS0fG"
   },
   "source": [
    "<h2 align=\"center\">点击下列图标在线运行HanLP</h2>\n",
    "<div align=\"center\">\n",
    "\t<a href=\"https://colab.research.google.com/github/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tst_restful.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "\t<a href=\"https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Ftst_restful.ipynb\" target=\"_blank\"><img src=\"https://mybinder.org/badge_logo.svg\" alt=\"Open In Binder\"/></a>\n",
    "</div>\n",
    "\n",
    "## 安装"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IYwV-UkNNzFp"
   },
   "source": [
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1Uf_u7ddMhUt"
   },
   "outputs": [],
   "source": [
    "!pip install hanlp_restful -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pp-1KqEOOJ4t"
   },
   "source": [
    "## 创建客户端"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "0tmKBu7sNAXX"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EmZDmLn9aGxG"
   },
   "source": [
    "#### 申请秘钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "elA_UyssOut_"
   },
   "source": [
    "## 文本风格转换\n",
    "输入短文本以及目标风格，执行文本风格转换："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 70
    },
    "id": "BqEmDMGGOtk3",
    "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['国家对中石油寄予巨大期望。', '要用创新推动高质量发展。']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HanLP.text_style_transfer(['国家对中石油抱有很大的期望.', '要用创新去推动高质量的发展。'],\n",
    "                          target_style='gov_doc')"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "tst_restful.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/tutorial.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "BZPSH4VkK7J2"
   },
   "source": [
    "欢迎来到HanLP在线交互环境，这是一个Jupyter记事本，可以输入任意Python代码并在线执行。请点击左上角【Run】来运行这篇NLP教程。\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XxPAiNwSK7J4"
   },
   "source": [
    "## 安装\n",
    "量体裁衣，HanLP提供**RESTful**（云端）和**native**（本地）两种API，分别面向轻量级和海量级两种场景。无论何种API何种语言，HanLP接口在语义上保持一致，你可以**任选一种**API来运行本教程。\n",
    "\n",
    "### 轻量级RESTful API\n",
    "\n",
    "仅数KB，适合敏捷开发、移动APP等场景。简单易用，无需GPU配环境，**强烈推荐**，秒速安装：\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "lgMa4kbfK7J5",
    "outputId": "5bb662d8-1665-4bcc-c517-70d1c4bc4837"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: hanlp_restful in /usr/local/lib/python3.7/dist-packages (0.0.7)\n",
      "Requirement already satisfied: hanlp-common in /usr/local/lib/python3.7/dist-packages (from hanlp_restful) (0.0.9)\n",
      "Requirement already satisfied: phrasetree in /usr/local/lib/python3.7/dist-packages (from hanlp-common->hanlp_restful) (0.0.8)\n"
     ]
    }
   ],
   "source": [
    "!pip install hanlp_restful"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "N4G6GbNmK7J6"
   },
   "source": [
    "创建客户端，填入服务器地址："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "3XM9-3-oK7J6"
   },
   "outputs": [],
   "source": [
    "from hanlp_restful import HanLPClient\n",
    "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名，zh中文，mul多语种"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pbeFH9jmK7J7"
   },
   "source": [
    "调用`parse`接口，传入一篇文章，得到HanLP精准的分析结果。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "mNJPvZ_3K7J7",
    "outputId": "4048d0d6-2dad-4582-e327-f99338f8f72b"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok/fine\": [\n",
      "    [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n",
      "    [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n",
      "  ],\n",
      "  \"tok/coarse\": [\n",
      "    [\"2021年\", \"HanLPv2.1\", \"为\", \"生产环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
      "    [\"阿婆主\", \"来到\", \"北京立方庭\", \"参观\", \"自然语义科技公司\", \"。\"]\n",
      "  ],\n",
      "  \"pos/ctb\": [\n",
      "    [\"NT\", \"NR\", \"P\", \"NN\", \"NN\", \"VV\", \"JJ\", \"NN\", \"AD\", \"JJ\", \"DEG\", \"CD\", \"NN\", \"NR\", \"NN\", \"PU\"],\n",
      "    [\"NN\", \"VV\", \"NR\", \"NR\", \"VV\", \"NN\", \"NN\", \"NN\", \"NN\", \"PU\"]\n",
      "  ],\n",
      "  \"pos/pku\": [\n",
      "    [\"t\", \"nx\", \"p\", \"vn\", \"n\", \"v\", \"b\", \"n\", \"d\", \"a\", \"u\", \"a\", \"n\", \"nx\", \"n\", \"w\"],\n",
      "    [\"n\", \"v\", \"ns\", \"ns\", \"v\", \"n\", \"n\", \"n\", \"n\", \"w\"]\n",
      "  ],\n",
      "  \"pos/863\": [\n",
      "    [\"nt\", \"w\", \"p\", \"v\", \"n\", \"v\", \"a\", \"nt\", \"d\", \"a\", \"u\", \"a\", \"n\", \"ws\", \"n\", \"w\"],\n",
      "    [\"n\", \"v\", \"ns\", \"n\", \"v\", \"n\", \"n\", \"n\", \"n\", \"w\"]\n",
      "  ],\n",
      "  \"ner/msra\": [\n",
      "    [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"ORGANIZATION\", 1, 2]],\n",
      "    [[\"北京立方庭\", \"LOCATION\", 2, 4], [\"自然语义科技公司\", \"ORGANIZATION\", 5, 9]]\n",
      "  ],\n",
      "  \"ner/pku\": [\n",
      "    [],\n",
      "    [[\"北京立方庭\", \"ns\", 2, 4], [\"自然语义科技公司\", \"nt\", 5, 9]]\n",
      "  ],\n",
      "  \"ner/ontonotes\": [\n",
      "    [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"ORG\", 1, 2]],\n",
      "    [[\"北京立方庭\", \"FAC\", 2, 4], [\"自然语义科技公司\", \"ORG\", 5, 9]]\n",
      "  ],\n",
      "  \"srl\": [\n",
      "    [[[\"2021年\", \"ARGM-TMP\", 0, 1], [\"HanLPv2.1\", \"ARG0\", 1, 2], [\"为生产环境\", \"ARG2\", 2, 5], [\"带来\", \"PRED\", 5, 6], [\"次世代最先进的多语种NLP技术\", \"ARG1\", 6, 15]], [[\"最\", \"ARGM-ADV\", 8, 9], [\"先进\", \"PRED\", 9, 10], [\"技术\", \"ARG0\", 14, 15]]],\n",
      "    [[[\"阿婆主\", \"ARG0\", 0, 1], [\"来到\", \"PRED\", 1, 2], [\"北京立方庭\", \"ARG1\", 2, 4]], [[\"阿婆主\", \"ARG0\", 0, 1], [\"参观\", \"PRED\", 4, 5], [\"自然语义科技公司\", \"ARG1\", 5, 9]]]\n",
      "  ],\n",
      "  \"dep\": [\n",
      "    [[6, \"tmod\"], [6, \"nsubj\"], [6, \"prep\"], [5, \"nn\"], [3, \"pobj\"], [0, \"root\"], [8, \"amod\"], [15, \"nn\"], [10, \"advmod\"], [15, \"rcmod\"], [10, \"assm\"], [13, \"nummod\"], [15, \"nn\"], [15, \"nn\"], [6, \"dobj\"], [6, \"punct\"]],\n",
      "    [[2, \"nsubj\"], [0, \"root\"], [4, \"nn\"], [2, \"dobj\"], [2, \"conj\"], [9, \"nn\"], [9, \"nn\"], [9, \"nn\"], [5, \"dobj\"], [2, \"punct\"]]\n",
      "  ],\n",
      "  \"sdp\": [\n",
      "    [[[6, \"Time\"]], [[6, \"Exp\"]], [[5, \"mPrep\"]], [[5, \"Desc\"]], [[6, \"Datv\"]], [[13, \"dDesc\"]], [[0, \"Root\"], [8, \"Desc\"], [13, \"Desc\"]], [[15, \"Time\"]], [[10, \"mDegr\"]], [[15, \"Desc\"]], [[10, \"mAux\"]], [[8, \"Quan\"], [13, \"Quan\"]], [[15, \"Desc\"]], [[15, \"Nmod\"]], [[6, \"Pat\"]], [[6, \"mPunc\"]]],\n",
      "    [[[2, \"Agt\"], [5, \"Agt\"]], [[0, \"Root\"]], [[4, \"Loc\"]], [[2, \"Lfin\"]], [[2, \"ePurp\"]], [[8, \"Nmod\"]], [[9, \"Nmod\"]], [[9, \"Nmod\"]], [[5, \"Datv\"]], [[5, \"mPunc\"]]]\n",
      "  ],\n",
      "  \"con\": [\n",
      "    [\"TOP\", [[\"IP\", [[\"NP\", [[\"NT\", [\"2021年\"]]]], [\"NP\", [[\"NR\", [\"HanLPv2.1\"]]]], [\"VP\", [[\"PP\", [[\"P\", [\"为\"]], [\"NP\", [[\"NN\", [\"生产\"]], [\"NN\", [\"环境\"]]]]]], [\"VP\", [[\"VV\", [\"带来\"]], [\"NP\", [[\"ADJP\", [[\"NP\", [[\"ADJP\", [[\"JJ\", [\"次\"]]]], [\"NP\", [[\"NN\", [\"世代\"]]]]]], [\"ADVP\", [[\"AD\", [\"最\"]]]], [\"VP\", [[\"JJ\", [\"先进\"]]]]]], [\"DEG\", [\"的\"]], [\"NP\", [[\"QP\", [[\"CD\", [\"多\"]]]], [\"NP\", [[\"NN\", [\"语种\"]]]]]], [\"NP\", [[\"NR\", [\"NLP\"]], [\"NN\", [\"技术\"]]]]]]]]]], [\"PU\", [\"。\"]]]]]],\n",
      "    [\"TOP\", [[\"IP\", [[\"NP\", [[\"NN\", [\"阿婆主\"]]]], [\"VP\", [[\"VP\", [[\"VV\", [\"来到\"]], [\"NP\", [[\"NR\", [\"北京\"]], [\"NR\", [\"立方庭\"]]]]]], [\"VP\", [[\"VV\", [\"参观\"]], [\"NP\", [[\"NN\", [\"自然\"]], [\"NN\", [\"语义\"]], [\"NN\", [\"科技\"]], [\"NN\", [\"公司\"]]]]]]]], [\"PU\", [\"。\"]]]]]]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "doc = HanLP.parse(\"2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。\")\n",
    "print(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "w4E8Kn_nK7J8"
   },
   "source": [
    "#### 可视化\n",
    "输出结果是一个可以`json`化的`dict`，键为[NLP任务名](https://hanlp.hankcs.com/docs/data_format.html#naming-convention)，值为分析结果。关于标注集含义，请参考[《语言学标注规范》](https://hanlp.hankcs.com/docs/annotations/index.html)及[《格式规范》](https://hanlp.hankcs.com/docs/data_format.html)。我们购买、标注或采用了世界上量级最大、种类最多的语料库用于联合多语种多任务学习，所以HanLP的标注集也是覆盖面最广的。通过`doc.pretty_print`，可以在等宽字体环境中得到可视化，你需要取消换行才能对齐可视化结果。我们已经发布HTML环境的可视化，在Jupyter Notebook中自动对齐中文。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 575
    },
    "id": "GZ79la4LK7J8",
    "outputId": "b9bd5dc0-52f9-4b42-93fd-7c4e49214ace"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Dep&nbsp;Tree&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────&nbsp;<br>&nbsp;┌─────────►&nbsp;<br>&nbsp;│┌────────►&nbsp;<br>&nbsp;││┌─►┌─────&nbsp;<br>&nbsp;│││&nbsp;&nbsp;│&nbsp;&nbsp;┌─►&nbsp;<br>&nbsp;│││&nbsp;&nbsp;└─►└──&nbsp;<br>┌┼┴┴────────&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;┌───►└──&nbsp;<br>││&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;│┌──►├──&nbsp;<br>││&nbsp;&nbsp;││&nbsp;&nbsp;&nbsp;└─►&nbsp;<br>││&nbsp;&nbsp;││&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;││┌─►└──&nbsp;<br>││&nbsp;&nbsp;│││&nbsp;&nbsp;┌─►&nbsp;<br>│└─►└┴┴──┴──&nbsp;<br>└──────────►&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Token&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier 
New,monospace; white-space: nowrap;\">Relati&nbsp;<br>──────&nbsp;<br>tmod&nbsp;&nbsp;&nbsp;<br>nsubj&nbsp;&nbsp;<br>prep&nbsp;&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>pobj&nbsp;&nbsp;&nbsp;<br>root&nbsp;&nbsp;&nbsp;<br>amod&nbsp;&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>advmod&nbsp;<br>rcmod&nbsp;&nbsp;<br>assm&nbsp;&nbsp;&nbsp;<br>nummod&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>dobj&nbsp;&nbsp;&nbsp;<br>punct&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">PoS&nbsp;<br>───&nbsp;<br>NT&nbsp;&nbsp;<br>NR&nbsp;&nbsp;<br>P&nbsp;&nbsp;&nbsp;<br>NN&nbsp;&nbsp;<br>NN&nbsp;&nbsp;<br>VV&nbsp;&nbsp;<br>JJ&nbsp;&nbsp;<br>NN&nbsp;&nbsp;<br>AD&nbsp;&nbsp;<br>JJ&nbsp;&nbsp;<br>DEG&nbsp;<br>CD&nbsp;&nbsp;<br>NN&nbsp;&nbsp;<br>NR&nbsp;&nbsp;<br>NN&nbsp;&nbsp;<br>PU&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">NER&nbsp;Type&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────────&nbsp;<br>───►DATE&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►ORGANIZATION&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────&nbsp;<br>───►ARGM-TMP&nbsp;<br>───►ARG0&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ARG2&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ARG1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA2&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►ARGM-ADV&nbsp;<br>╟──►PRED&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►ARG0&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">PoS&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;9&nbsp;<br>─────────────────────────────────────────────────────────<br>NT&nbsp;───────────────────────────────────────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>NR&nbsp;───────────────────────────────────────────►NP────┤&nbsp;&nbsp;&nbsp;<br>P&nbsp;───────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├────────────────────────►PP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN&nbsp;──┴►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>VV&nbsp;──────────────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>JJ&nbsp;───►ADJP──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP────┤&nbsp;&nbsp;&nbsp;<br>NN&nbsp;───►NP&nbsp;───┴►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>AD&nbsp;───────────►ADVP──┼►ADJP──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br
>JJ&nbsp;───────────►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>DEG──────────────────────────┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>CD&nbsp;───►QP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN&nbsp;───►NP&nbsp;───┴────────►NP────┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NR&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN&nbsp;──┴────────────────►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>PU&nbsp;──────────────────────────────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div><br><div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Dep&nbsp;Tree&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>┌┬────┬──┴──&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;└─►└──&nbsp;<br>│└─►┌───────&nbsp;<br>│&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;┌───►&nbsp;<br>│&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;│┌──►&nbsp;<br>│&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;││┌─►&nbsp;<br>│&nbsp;&nbsp;&nbsp;└─►└┴┴──&nbsp;<br>└──────────►&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>阿婆主&nbsp;<br>来到&nbsp;&nbsp;<br>北京&nbsp;&nbsp;<br>立方庭&nbsp;<br>参观&nbsp;&nbsp;<br>自然&nbsp;&nbsp;<br>语义&nbsp;&nbsp;<br>科技&nbsp;&nbsp;<br>公司&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Relat&nbsp;<br>─────&nbsp;<br>nsubj&nbsp;<br>root&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;<br>dobj&nbsp;&nbsp;<br>conj&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;<br>dobj&nbsp;&nbsp;<br>punct&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Po&nbsp;<br>──&nbsp;<br>NN&nbsp;<br>VV&nbsp;<br>NR&nbsp;<br>NR&nbsp;<br>VV&nbsp;<br>NN&nbsp;<br>NN&nbsp;<br>NN&nbsp;<br>NN&nbsp;<br>PU&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>阿婆主&nbsp;<br>来到&nbsp;&nbsp;<br>北京&nbsp;&nbsp;<br>立方庭&nbsp;<br>参观&nbsp;&nbsp;<br>自然&nbsp;&nbsp;<br>语义&nbsp;&nbsp;<br>科技&nbsp;&nbsp;<br>公司&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier 
New,monospace; white-space: nowrap;\">NER&nbsp;Type&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┴►LOCATION&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ORGANIZATION&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>阿婆主&nbsp;<br>来到&nbsp;&nbsp;<br>北京&nbsp;&nbsp;<br>立方庭&nbsp;<br>参观&nbsp;&nbsp;<br>自然&nbsp;&nbsp;<br>语义&nbsp;&nbsp;<br>科技&nbsp;&nbsp;<br>公司&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA1&nbsp;&nbsp;<br>────────&nbsp;<br>───►ARG0&nbsp;<br>╟──►PRED&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┴►ARG1&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>阿婆主&nbsp;<br>来到&nbsp;&nbsp;<br>北京&nbsp;&nbsp;<br>立方庭&nbsp;<br>参观&nbsp;&nbsp;<br>自然&nbsp;&nbsp;<br>语义&nbsp;&nbsp;<br>科技&nbsp;&nbsp;<br>公司&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">SRL&nbsp;PA2&nbsp;&nbsp;<br>────────&nbsp;<br>───►ARG0&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ARG1&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>阿婆主&nbsp;<br>来到&nbsp;&nbsp;<br>北京&nbsp;&nbsp;<br>立方庭&nbsp;<br>参观&nbsp;&nbsp;<br>自然&nbsp;&nbsp;<br>语义&nbsp;&nbsp;<br>科技&nbsp;&nbsp;<br>公司&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Po&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;<br>────────────────────────────────<br>NN───────────────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>VV──────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NR──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NR──┴►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>VV──────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP────┤&nbsp;&nbsp;&nbsp;<br>NN──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br>NN&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN&nbsp;&nbsp;├►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN──┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>PU──────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {
      "tags": []
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "doc.pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WIKyCLQJK7J9"
   },
   "source": [
    "#### 申请秘钥\n",
    "由于服务器算力有限，匿名用户每分钟限2次调用。如果你需要更多调用次数，[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "PcZAZopQK7J9"
   },
   "source": [
    "### 海量级native API\n",
    "\n",
    "依赖PyTorch、TensorFlow等深度学习技术，适合**专业**NLP工程师、研究者以及本地海量数据场景。要求Python 3.6以上，支持Windows，推荐*nix。可以在CPU上运行，推荐GPU/TPU。\n",
    "\n",
    "无论是Windows、Linux还是macOS，HanLP的安装只需一句话搞定。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "bjRdHxl1K7J-",
    "outputId": "659d7920-c857-4eb8-f45f-dba84366688a"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: hanlp in /usr/local/lib/python3.7/dist-packages (2.1.0a54)\n",
      "Requirement already satisfied: sentencepiece>=0.1.91torch>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from hanlp) (0.1.96)\n",
      "Requirement already satisfied: toposort==1.5 in /usr/local/lib/python3.7/dist-packages (from hanlp) (1.5)\n",
      "Requirement already satisfied: alnlp in /usr/local/lib/python3.7/dist-packages (from hanlp) (1.0.0rc27)\n",
      "Requirement already satisfied: hanlp-common>=0.0.9 in /usr/local/lib/python3.7/dist-packages (from hanlp) (0.0.9)\n",
      "Requirement already satisfied: hanlp-downloader in /usr/local/lib/python3.7/dist-packages (from hanlp) (0.0.23)\n",
      "Requirement already satisfied: hanlp-trie>=0.0.2 in /usr/local/lib/python3.7/dist-packages (from hanlp) (0.0.2)\n",
      "Requirement already satisfied: transformers>=4.1.1 in /usr/local/lib/python3.7/dist-packages (from hanlp) (4.9.1)\n",
      "Requirement already satisfied: termcolor in /usr/local/lib/python3.7/dist-packages (from hanlp) (1.1.0)\n",
      "Requirement already satisfied: pynvml in /usr/local/lib/python3.7/dist-packages (from hanlp) (11.0.0)\n",
      "Requirement already satisfied: phrasetree in /usr/local/lib/python3.7/dist-packages (from hanlp-common>=0.0.9->hanlp) (0.0.8)\n",
      "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (3.0.12)\n",
      "Requirement already satisfied: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (0.0.45)\n",
      "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (0.10.3)\n",
      "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (21.0)\n",
      "Requirement already satisfied: huggingface-hub==0.0.12 in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (0.0.12)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (5.4.1)\n",
      "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (2019.12.20)\n",
      "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (4.41.1)\n",
      "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (2.23.0)\n",
      "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (4.6.1)\n",
      "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (1.19.5)\n",
      "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub==0.0.12->transformers>=4.1.1->hanlp) (3.7.4.3)\n",
      "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers>=4.1.1->hanlp) (2.4.7)\n",
      "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from alnlp->hanlp) (1.9.0+cu102)\n",
      "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers>=4.1.1->hanlp) (3.5.0)\n",
      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers>=4.1.1->hanlp) (1.24.3)\n",
      "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers>=4.1.1->hanlp) (3.0.4)\n",
      "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers>=4.1.1->hanlp) (2.10)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers>=4.1.1->hanlp) (2021.5.30)\n",
      "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers>=4.1.1->hanlp) (1.0.1)\n",
      "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers>=4.1.1->hanlp) (7.1.2)\n",
      "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers>=4.1.1->hanlp) (1.15.0)\n"
     ]
    }
   ],
   "source": [
    "!pip install hanlp -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "dHhIRwgqK7J-"
   },
   "source": [
    "#### 加载模型\n",
    "HanLP的工作流程是先加载模型，模型的标示符存储在`hanlp.pretrained`这个包中，按照NLP任务归类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "KHY6bsG_K7J-",
    "outputId": "208c12b6-2702-4ee7-a03a-f053b7ad3479"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n",
       " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n",
       " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210517_225654.zip',\n",
       " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n",
       " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n",
       " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip'}"
      ]
     },
     "execution_count": 6,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hanlp\n",
    "hanlp.pretrained.mtl.ALL # MTL多任务，具体任务见模型名称，语种见名称最后一个字段或相应语料库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WDT3Hks0K7J_"
   },
   "source": [
    "调用`hanlp.load`进行加载，模型会自动下载到本地缓存。自然语言处理分为许多任务，分词只是最初级的一个。与其每个任务单独创建一个模型，不如利用HanLP的联合模型一次性完成多个任务："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4Cj8a73rK7J_",
    "outputId": "a92ac736-6e61-4949-8d35-56c773faf950"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": []
    }
   ],
   "source": [
    "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pBqH_My8K7J_"
   },
   "source": [
    "## 多任务批量分析\n",
    "客户端创建完毕，或者模型加载完毕后，就可以传入一个或多个句子进行分析了："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "B58npfkHK7J_",
    "outputId": "69fed02d-39cb-4b4c-d2c8-d0edc25970ea"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"tok/fine\": [\n",
      "    [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n",
      "    [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n",
      "  ],\n",
      "  \"tok/coarse\": [\n",
      "    [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
      "    [\"阿婆主\", \"来到\", \"北京立方庭\", \"参观\", \"自然语义科技公司\", \"。\"]\n",
      "  ],\n",
      "  \"pos/ctb\": [\n",
      "    [\"NT\", \"NR\", \"P\", \"NN\", \"NN\", \"VV\", \"JJ\", \"NN\", \"AD\", \"JJ\", \"DEG\", \"CD\", \"NN\", \"NR\", \"NN\", \"PU\"],\n",
      "    [\"NN\", \"VV\", \"NR\", \"NR\", \"VV\", \"NN\", \"NN\", \"NN\", \"NN\", \"PU\"]\n",
      "  ],\n",
      "  \"pos/pku\": [\n",
      "    [\"t\", \"nx\", \"p\", \"vn\", \"n\", \"v\", \"b\", \"n\", \"d\", \"a\", \"u\", \"a\", \"n\", \"nx\", \"n\", \"w\"],\n",
      "    [\"n\", \"v\", \"ns\", \"ns\", \"v\", \"n\", \"n\", \"n\", \"n\", \"w\"]\n",
      "  ],\n",
      "  \"pos/863\": [\n",
      "    [\"nt\", \"w\", \"p\", \"v\", \"n\", \"v\", \"a\", \"nt\", \"d\", \"a\", \"u\", \"a\", \"n\", \"ws\", \"n\", \"w\"],\n",
      "    [\"n\", \"v\", \"ns\", \"n\", \"v\", \"n\", \"n\", \"n\", \"n\", \"w\"]\n",
      "  ],\n",
      "  \"ner/msra\": [\n",
      "    [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"WWW\", 1, 2]],\n",
      "    [[\"北京\", \"LOCATION\", 2, 3], [\"立方庭\", \"LOCATION\", 3, 4], [\"自然语义科技公司\", \"ORGANIZATION\", 5, 9]]\n",
      "  ],\n",
      "  \"ner/pku\": [\n",
      "    [],\n",
      "    [[\"北京立方庭\", \"ns\", 2, 4], [\"自然语义科技公司\", \"nt\", 5, 9]]\n",
      "  ],\n",
      "  \"ner/ontonotes\": [\n",
      "    [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"ORG\", 1, 2]],\n",
      "    [[\"北京立方庭\", \"FAC\", 2, 4], [\"自然语义科技公司\", \"ORG\", 5, 9]]\n",
      "  ],\n",
      "  \"srl\": [\n",
      "    [[[\"2021年\", \"ARGM-TMP\", 0, 1], [\"HanLPv2.1\", \"ARG0\", 1, 2], [\"为生产环境\", \"ARG2\", 2, 5], [\"带来\", \"PRED\", 5, 6], [\"次世代最先进的多语种NLP技术\", \"ARG1\", 6, 15]], [[\"最\", \"ARGM-ADV\", 8, 9], [\"先进\", \"PRED\", 9, 10], [\"技术\", \"ARG0\", 14, 15]]],\n",
      "    [[[\"阿婆主\", \"ARG0\", 0, 1], [\"来到\", \"PRED\", 1, 2], [\"北京立方庭\", \"ARG1\", 2, 4]], [[\"阿婆主\", \"ARG0\", 0, 1], [\"参观\", \"PRED\", 4, 5], [\"自然语义科技公司\", \"ARG1\", 5, 9]]]\n",
      "  ],\n",
      "  \"dep\": [\n",
      "    [[6, \"tmod\"], [6, \"nsubj\"], [6, \"prep\"], [5, \"nn\"], [3, \"pobj\"], [0, \"root\"], [8, \"amod\"], [15, \"nn\"], [10, \"advmod\"], [15, \"rcmod\"], [10, \"assm\"], [13, \"nummod\"], [15, \"nn\"], [15, \"nn\"], [6, \"dobj\"], [6, \"punct\"]],\n",
      "    [[2, \"nsubj\"], [0, \"root\"], [4, \"nn\"], [2, \"dobj\"], [2, \"conj\"], [9, \"nn\"], [9, \"nn\"], [9, \"nn\"], [5, \"dobj\"], [2, \"punct\"]]\n",
      "  ],\n",
      "  \"sdp\": [\n",
      "    [[[6, \"Time\"]], [[6, \"Exp\"]], [[5, \"mPrep\"]], [[5, \"Desc\"]], [[6, \"Datv\"]], [[13, \"dDesc\"]], [[0, \"Root\"], [8, \"Desc\"], [13, \"Desc\"]], [[15, \"Time\"]], [[10, \"mDegr\"]], [[15, \"Desc\"]], [[10, \"mAux\"]], [[8, \"Quan\"], [13, \"Quan\"]], [[15, \"Desc\"]], [[15, \"Nmod\"]], [[6, \"Pat\"]], [[6, \"mPunc\"]]],\n",
      "    [[[2, \"Agt\"], [5, \"Agt\"]], [[0, \"Root\"]], [[4, \"Loc\"]], [[2, \"Lfin\"]], [[2, \"ePurp\"]], [[8, \"Nmod\"]], [[9, \"Nmod\"]], [[9, \"Nmod\"]], [[5, \"Datv\"]], [[5, \"mPunc\"]]]\n",
      "  ],\n",
      "  \"con\": [\n",
      "    [\"TOP\", [[\"IP\", [[\"NP\", [[\"NT\", [\"2021年\"]]]], [\"NP\", [[\"NR\", [\"HanLPv2.1\"]]]], [\"VP\", [[\"PP\", [[\"P\", [\"为\"]], [\"NP\", [[\"NN\", [\"生产\"]], [\"NN\", [\"环境\"]]]]]], [\"VP\", [[\"VV\", [\"带来\"]], [\"NP\", [[\"ADJP\", [[\"NP\", [[\"ADJP\", [[\"JJ\", [\"次\"]]]], [\"NP\", [[\"NN\", [\"世代\"]]]]]], [\"ADVP\", [[\"AD\", [\"最\"]]]], [\"VP\", [[\"JJ\", [\"先进\"]]]]]], [\"DEG\", [\"的\"]], [\"NP\", [[\"QP\", [[\"CD\", [\"多\"]]]], [\"NP\", [[\"NN\", [\"语种\"]]]]]], [\"NP\", [[\"NR\", [\"NLP\"]], [\"NN\", [\"技术\"]]]]]]]]]], [\"PU\", [\"。\"]]]]]],\n",
      "    [\"TOP\", [[\"IP\", [[\"NP\", [[\"NN\", [\"阿婆主\"]]]], [\"VP\", [[\"VP\", [[\"VV\", [\"来到\"]], [\"NP\", [[\"NR\", [\"北京\"]], [\"NR\", [\"立方庭\"]]]]]], [\"VP\", [[\"VV\", [\"参观\"]], [\"NP\", [[\"NN\", [\"自然\"]], [\"NN\", [\"语义\"]], [\"NN\", [\"科技\"]], [\"NN\", [\"公司\"]]]]]]]], [\"PU\", [\"。\"]]]]]]\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "doc = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'])\n",
    "print(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "tvuxfWPYK7J_"
   },
   "source": [
    "## 可视化\n",
    "输出结果是一个可以`json`化的`dict`，键为[NLP任务名](https://hanlp.hankcs.com/docs/data_format.html#naming-convention)，值为分析结果。关于标注集含义，请参考[《语言学标注规范》](https://hanlp.hankcs.com/docs/annotations/index.html)及[《格式规范》](https://hanlp.hankcs.com/docs/data_format.html)。我们购买、标注或采用了世界上量级最大、种类最多的语料库用于联合多语种多任务学习，所以HanLP的标注集也是覆盖面最广的。通过`doc.pretty_print`，可以在等宽字体环境中得到可视化，你需要取消换行才能对齐可视化结果。我们已经发布HTML环境的可视化，在Jupyter Notebook中自动对齐中文。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 575
    },
    "id": "M8WxTdlAK7KA",
    "outputId": "a027a302-74d8-48c9-b30d-45ebf8741c1e"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Dep&nbsp;Tree&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────&nbsp;<br>&nbsp;┌─────────►&nbsp;<br>&nbsp;│┌────────►&nbsp;<br>&nbsp;││┌─►┌─────&nbsp;<br>&nbsp;│││&nbsp;&nbsp;│&nbsp;&nbsp;┌─►&nbsp;<br>&nbsp;│││&nbsp;&nbsp;└─►└──&nbsp;<br>┌┼┴┴────────&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;┌───►└──&nbsp;<br>││&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;│┌──►├──&nbsp;<br>││&nbsp;&nbsp;││&nbsp;&nbsp;&nbsp;└─►&nbsp;<br>││&nbsp;&nbsp;││&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;││┌─►└──&nbsp;<br>││&nbsp;&nbsp;│││&nbsp;&nbsp;┌─►&nbsp;<br>│└─►└┴┴──┴──&nbsp;<br>└──────────►&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Token&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier 
New,monospace; white-space: nowrap;\">Relati&nbsp;<br>──────&nbsp;<br>tmod&nbsp;&nbsp;&nbsp;<br>nsubj&nbsp;&nbsp;<br>prep&nbsp;&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>pobj&nbsp;&nbsp;&nbsp;<br>root&nbsp;&nbsp;&nbsp;<br>amod&nbsp;&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>advmod&nbsp;<br>rcmod&nbsp;&nbsp;<br>assm&nbsp;&nbsp;&nbsp;<br>nummod&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>dobj&nbsp;&nbsp;&nbsp;<br>punct&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">PoS&nbsp;<br>───&nbsp;<br>NT&nbsp;&nbsp;<br>NR&nbsp;&nbsp;<br>P&nbsp;&nbsp;&nbsp;<br>NN&nbsp;&nbsp;<br>NN&nbsp;&nbsp;<br>VV&nbsp;&nbsp;<br>JJ&nbsp;&nbsp;<br>NN&nbsp;&nbsp;<br>AD&nbsp;&nbsp;<br>JJ&nbsp;&nbsp;<br>DEG&nbsp;<br>CD&nbsp;&nbsp;<br>NN&nbsp;&nbsp;<br>NR&nbsp;&nbsp;<br>NN&nbsp;&nbsp;<br>PU&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">NER&nbsp;Type&nbsp;<br>────────&nbsp;<br>───►DATE&nbsp;<br>───►WWW&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; 
font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────&nbsp;<br>───►ARGM-TMP&nbsp;<br>───►ARG0&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ARG2&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ARG1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA2&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►ARGM-ADV&nbsp;<br>╟──►PRED&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►ARG0&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">PoS&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;9&nbsp;<br>─────────────────────────────────────────────────────────<br>NT&nbsp;───────────────────────────────────────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>NR&nbsp;───────────────────────────────────────────►NP────┤&nbsp;&nbsp;&nbsp;<br>P&nbsp;───────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├────────────────────────►PP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN&nbsp;──┴►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>VV&nbsp;──────────────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>JJ&nbsp;───►ADJP──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP────┤&nbsp;&nbsp;&nbsp;<br>NN&nbsp;───►NP&nbsp;───┴►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>AD&nbsp;───────────►ADVP──┼►ADJP──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br
>JJ&nbsp;───────────►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>DEG──────────────────────────┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>CD&nbsp;───►QP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN&nbsp;───►NP&nbsp;───┴────────►NP────┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NR&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN&nbsp;──┴────────────────►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>PU&nbsp;──────────────────────────────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div><br><div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Dep&nbsp;Tree&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>┌┬────┬──┴──&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;└─►└──&nbsp;<br>│└─►┌───────&nbsp;<br>│&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;┌───►&nbsp;<br>│&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;│┌──►&nbsp;<br>│&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;││┌─►&nbsp;<br>│&nbsp;&nbsp;&nbsp;└─►└┴┴──&nbsp;<br>└──────────►&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>阿婆主&nbsp;<br>来到&nbsp;&nbsp;<br>北京&nbsp;&nbsp;<br>立方庭&nbsp;<br>参观&nbsp;&nbsp;<br>自然&nbsp;&nbsp;<br>语义&nbsp;&nbsp;<br>科技&nbsp;&nbsp;<br>公司&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Relat&nbsp;<br>─────&nbsp;<br>nsubj&nbsp;<br>root&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;<br>dobj&nbsp;&nbsp;<br>conj&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;<br>dobj&nbsp;&nbsp;<br>punct&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Po&nbsp;<br>──&nbsp;<br>NN&nbsp;<br>VV&nbsp;<br>NR&nbsp;<br>NR&nbsp;<br>VV&nbsp;<br>NN&nbsp;<br>NN&nbsp;<br>NN&nbsp;<br>NN&nbsp;<br>PU&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>阿婆主&nbsp;<br>来到&nbsp;&nbsp;<br>北京&nbsp;&nbsp;<br>立方庭&nbsp;<br>参观&nbsp;&nbsp;<br>自然&nbsp;&nbsp;<br>语义&nbsp;&nbsp;<br>科技&nbsp;&nbsp;<br>公司&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier 
New,monospace; white-space: nowrap;\">NER&nbsp;Type&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►LOCATION&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►LOCATION&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ORGANIZATION&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>阿婆主&nbsp;<br>来到&nbsp;&nbsp;<br>北京&nbsp;&nbsp;<br>立方庭&nbsp;<br>参观&nbsp;&nbsp;<br>自然&nbsp;&nbsp;<br>语义&nbsp;&nbsp;<br>科技&nbsp;&nbsp;<br>公司&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">SRL&nbsp;PA1&nbsp;&nbsp;<br>────────&nbsp;<br>───►ARG0&nbsp;<br>╟──►PRED&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┴►ARG1&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; 
font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>阿婆主&nbsp;<br>来到&nbsp;&nbsp;<br>北京&nbsp;&nbsp;<br>立方庭&nbsp;<br>参观&nbsp;&nbsp;<br>自然&nbsp;&nbsp;<br>语义&nbsp;&nbsp;<br>科技&nbsp;&nbsp;<br>公司&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">SRL&nbsp;PA2&nbsp;&nbsp;<br>────────&nbsp;<br>───►ARG0&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ARG1&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>阿婆主&nbsp;<br>来到&nbsp;&nbsp;<br>北京&nbsp;&nbsp;<br>立方庭&nbsp;<br>参观&nbsp;&nbsp;<br>自然&nbsp;&nbsp;<br>语义&nbsp;&nbsp;<br>科技&nbsp;&nbsp;<br>公司&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Po&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;<br>────────────────────────────────<br>NN───────────────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>VV──────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NR──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NR──┴►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>VV──────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP────┤&nbsp;&nbsp;&nbsp;<br>NN──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br>NN&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN&nbsp;&nbsp;├►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN──┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>PU──────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {
      "tags": []
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "doc.pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_B2HDiZgK7KA"
   },
   "source": [
    "## 指定任务\n",
    "简洁的接口也支持灵活的参数，任务越少，速度越快。如指定仅执行分词："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "id": "9Mnys4t2K7KA",
    "outputId": "88d72a72-c095-4f6d-df0b-d881887087ce"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">阿婆主&nbsp;来到&nbsp;北京&nbsp;立方庭&nbsp;参观&nbsp;自然&nbsp;语义&nbsp;科技&nbsp;公司&nbsp;。</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {
      "tags": []
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "s5RkVkVkK7KA"
   },
   "source": [
    "### 执行粗颗粒度分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "id": "5R_PwELlK7KA",
    "outputId": "5ce2c037-eb44-481f-9de2-dc0d4122e7c4"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">阿婆主&nbsp;来到&nbsp;北京立方庭&nbsp;参观&nbsp;自然语义科技公司&nbsp;。</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {
      "tags": []
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok/coarse').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pTrajkHEK7KB"
   },
   "source": [
    "### 执行分词和PKU词性标注"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "id": "kkkgVKFqK7KB",
    "outputId": "e9f9879b-47ce-459a-e089-923de1c6436c"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">阿婆主/n&nbsp;来到/v&nbsp;北京/ns&nbsp;立方庭/ns&nbsp;参观/v&nbsp;自然/n&nbsp;语义/n&nbsp;科技/n&nbsp;公司/n&nbsp;。/w</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {
      "tags": []
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='pos/pku').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "YLLTVY0RK7KB"
   },
   "source": [
    "### 执行粗颗粒度分词和PKU词性标注"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "id": "5qSlqbcfK7KB",
    "outputId": "66944459-bc22-4bd9-e4af-4d2aba9316f3"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">阿婆主/n&nbsp;来到/v&nbsp;北京立方庭/ns&nbsp;参观/v&nbsp;自然语义科技公司/n&nbsp;。/w</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {
      "tags": []
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks=['tok/coarse', 'pos/pku'], skip_tasks='tok/fine').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "3nNojvHiK7KB"
   },
   "source": [
    "### 执行分词和MSRA标准NER"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 225
    },
    "id": "tTVoEPiAK7KB",
    "outputId": "b8dc8c24-3392-4712-d1b6-e2dc8b7710e8"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>阿婆主&nbsp;<br>来到&nbsp;&nbsp;<br>北京&nbsp;&nbsp;<br>立方庭&nbsp;<br>参观&nbsp;&nbsp;<br>自然&nbsp;&nbsp;<br>语义&nbsp;&nbsp;<br>科技&nbsp;&nbsp;<br>公司&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">NER&nbsp;Type&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────────<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►LOCATION&nbsp;&nbsp;&nbsp;&nbsp;<br>───►LOCATION&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ORGANIZATION<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {
      "tags": []
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner/msra').pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "uG2wYTfmK7KB"
   },
   "source": [
    "### 执行分词、词性标注和依存句法分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 225
    },
    "id": "WXl6f7zyK7KC",
    "outputId": "8671e0e4-d0c3-40f4-a4db-ba9aaec225ab"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Dep&nbsp;Tree&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>┌┬────┬──┴──&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;└─►└──&nbsp;<br>│└─►┌───────&nbsp;<br>│&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;┌───►&nbsp;<br>│&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;│┌──►&nbsp;<br>│&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;││┌─►&nbsp;<br>│&nbsp;&nbsp;&nbsp;└─►└┴┴──&nbsp;<br>└──────────►&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>阿婆主&nbsp;<br>来到&nbsp;&nbsp;<br>北京&nbsp;&nbsp;<br>立方庭&nbsp;<br>参观&nbsp;&nbsp;<br>自然&nbsp;&nbsp;<br>语义&nbsp;&nbsp;<br>科技&nbsp;&nbsp;<br>公司&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Relat&nbsp;<br>─────&nbsp;<br>nsubj&nbsp;<br>root&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;<br>dobj&nbsp;&nbsp;<br>conj&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;<br>nn&nbsp;&nbsp;&nbsp;&nbsp;<br>dobj&nbsp;&nbsp;<br>punct&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Po<br>──<br>NN<br>VV<br>NR<br>NR<br>VV<br>NN<br>NN<br>NN<br>NN<br>PU</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {
      "tags": []
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "doc = HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks=['pos', 'dep'])\n",
    "doc.pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ocxM3LsGK7KC"
   },
   "source": [
    "转换为CoNLL格式："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "NtKmSB_0K7KC",
    "outputId": "cc9245b3-32c2-4d35-88a8-a7d91127eca7"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t阿婆主\t_\tNN\t_\t_\t2\tnsubj\t_\t_\n",
      "2\t来到\t_\tVV\t_\t_\t0\troot\t_\t_\n",
      "3\t北京\t_\tNR\t_\t_\t4\tnn\t_\t_\n",
      "4\t立方庭\t_\tNR\t_\t_\t2\tdobj\t_\t_\n",
      "5\t参观\t_\tVV\t_\t_\t2\tconj\t_\t_\n",
      "6\t自然\t_\tNN\t_\t_\t9\tnn\t_\t_\n",
      "7\t语义\t_\tNN\t_\t_\t9\tnn\t_\t_\n",
      "8\t科技\t_\tNN\t_\t_\t9\tnn\t_\t_\n",
      "9\t公司\t_\tNN\t_\t_\t5\tdobj\t_\t_\n",
      "10\t。\t_\tPU\t_\t_\t2\tpunct\t_\t_\n"
     ]
    }
   ],
   "source": [
    "print(doc.to_conll())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "PNBo-kETK7KC"
   },
   "source": [
    "### 执行分词、词性标注和短语成分分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 225
    },
    "id": "Ja8dib6XK7KC",
    "outputId": "a972f5bb-ae23-47a9-cd9f-6070a5b39f50"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;<br>───&nbsp;<br>阿婆主&nbsp;<br>来到&nbsp;&nbsp;<br>北京&nbsp;&nbsp;<br>立方庭&nbsp;<br>参观&nbsp;&nbsp;<br>自然&nbsp;&nbsp;<br>语义&nbsp;&nbsp;<br>科技&nbsp;&nbsp;<br>公司&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Po&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;<br>────────────────────────────────<br>NN───────────────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>VV──────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NR──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NR──┴►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>VV──────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP────┤&nbsp;&nbsp;&nbsp;<br>NN──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br>NN&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN&nbsp;&nbsp;├►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NN──┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>PU──────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {
      "tags": []
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "doc = HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks=['pos', 'con'])\n",
    "doc.pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Mg3DhvjhK7KC"
   },
   "source": [
    "#### 将短语结构树以bracketed形式打印"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "kE8iBZNUK7KC",
    "outputId": "79e2a72d-e473-41ca-c054-9595a4dd5971"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(TOP\n",
      "  (IP\n",
      "    (NP (NN 阿婆主))\n",
      "    (VP\n",
      "      (VP (VV 来到) (NP (NR 北京) (NR 立方庭)))\n",
      "      (VP (VV 参观) (NP (NN 自然) (NN 语义) (NN 科技) (NN 公司))))\n",
      "    (PU 。)))\n"
     ]
    }
   ],
   "source": [
    "print(doc['con'])  # str(doc['con'])会将短语结构列表转换为括号形式"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "MfleaY_pK7KC"
   },
   "source": [
    "关于标注集含义，请参考[《语言学标注规范》](https://hanlp.hankcs.com/docs/annotations/index.html)及[《格式规范》](https://hanlp.hankcs.com/docs/data_format.html)。我们购买、标注或采用了世界上量级最大、种类最多的语料库用于联合多语种多任务学习，所以HanLP的标注集也是覆盖面最广的。\n",
    "\n",
    "## 多语种支持\n",
    "总之，可以通过tasks参数灵活调用各种NLP任务。除了中文联合模型之外，你还可以在文档中找到许多其他语种的模型，比如日语："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "oJP8dvfvK7KD",
    "outputId": "2262ccdb-7cf5-4859-8d6c-18300e54c22e"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": []
    }
   ],
   "source": [
    "ja = hanlp.load(hanlp.pretrained.mtl.NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 991
    },
    "id": "3WPvCbH2K7KD",
    "outputId": "46a9435d-ed5b-47ef-99c6-71d7ee0fc6e8"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Dep&nbsp;Tree&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>──────────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>┌─────────►├──&nbsp;<br>│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└─►&nbsp;<br>│&nbsp;&nbsp;&nbsp;┌────────►&nbsp;<br>│&nbsp;&nbsp;&nbsp;│┌───────►&nbsp;<br>│&nbsp;&nbsp;&nbsp;││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>│&nbsp;&nbsp;&nbsp;││┌───►├──&nbsp;<br>│&nbsp;&nbsp;&nbsp;│││&nbsp;&nbsp;&nbsp;&nbsp;└─►&nbsp;<br>│&nbsp;&nbsp;&nbsp;│││┌─────►&nbsp;<br>│&nbsp;&nbsp;&nbsp;││││┌────►&nbsp;<br>│&nbsp;&nbsp;&nbsp;│││││┌───►&nbsp;<br>│&nbsp;&nbsp;&nbsp;││││││┌──►&nbsp;<br>│&nbsp;&nbsp;&nbsp;│││││││┌─►&nbsp;<br>│┌─►└┴┴┴┴┴┴┼──&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└─►&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►├──&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;└─►&nbsp;<br>└┴──────┴┬┬┬──&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;││└─►&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│└──►&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└───►&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Token&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先端&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>言語&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技術&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>を&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>本番&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>環境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>導入&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>し&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>ます&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Relation&nbsp;<br>────────&nbsp;<br>nummod&nbsp;&nbsp;&nbsp;<br>obl&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>punct&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>nmod&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>compound&nbsp;<br>compound&nbsp;<br>compound&nbsp;<br>compound&nbsp;<br>obj&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>obl&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>root&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>aux&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>aux&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>punct&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">PoS&nbsp;<br>───&nbsp;<br>NUM&nbsp;<br>CL&nbsp;&nbsp;<br>PU&nbsp;&nbsp;<br>NPR&nbsp;<br>P&nbsp;&nbsp;&nbsp;<br>N&nbsp;&nbsp;&nbsp;<br>N&nbsp;&nbsp;&nbsp;<br>P&nbsp;&nbsp;&nbsp;<br>N&nbsp;&nbsp;&nbsp;<br>N&nbsp;&nbsp;&nbsp;<br>NUM&nbsp;<br>N&nbsp;&nbsp;&nbsp;<br>N&nbsp;&nbsp;&nbsp;<br>N&nbsp;&nbsp;&nbsp;<br>P&nbsp;&nbsp;&nbsp;<br>N&nbsp;&nbsp;&nbsp;<br>N&nbsp;&nbsp;&nbsp;<br>P&nbsp;&nbsp;&nbsp;<br>VB&nbsp;&nbsp;<br>VB0&nbsp;<br>AX&nbsp;&nbsp;<br>PU&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先端&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>言語&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技術&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>を&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>本番&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>環境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>導入&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>し&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>ます&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">NER&nbsp;Type&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┴►DATE&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►ARTIFACT&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先端&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>言語&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技術&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>を&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>本番&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>環境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>導入&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>し&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>ます&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA1&nbsp;&nbsp;<br>────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►修飾&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先端&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>言語&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技術&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>を&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>本番&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>環境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>導入&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>し&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>ます&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA3&nbsp;&nbsp;<br>────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┴►修飾&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先端&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>言語&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技術&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>を&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>本番&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>環境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>導入&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>し&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>ます&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA4&nbsp;&nbsp;<br>────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►修飾&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┴►ノ&nbsp;&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先端&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>言語&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技術&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>を&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>本番&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>環境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>導入&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>し&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>ます&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA5&nbsp;&nbsp;<br>────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►修飾&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先端&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>言語&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技術&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>を&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>本番&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>環境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>導入&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>し&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>ます&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA6&nbsp;&nbsp;<br>────────&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►時間&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┴►ガ&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ヲ&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ニ&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先端&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>言語&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技術&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>を&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>本番&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>環境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>導入&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>し&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>ます&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">PoS&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;<br>────────────────────────────────────────────────────<br>NUM──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>CL&nbsp;──┴►NUMCLP────────&nbsp;───────────────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>PU&nbsp;────────&nbsp;─────────&nbsp;──────────────────────────┤&nbsp;&nbsp;&nbsp;<br>NPR───►NP&nbsp;─────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>P&nbsp;─────────&nbsp;───┴►────&nbsp;───────────────────►PP────┤&nbsp;&nbsp;&nbsp;<br>N&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>N&nbsp;───┴►NP&nbsp;─────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>P&nbsp;─────────&nbsp;───┴►PP&nbsp;────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>N&nbsp;─────────&nbsp;─────────&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nb
sp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>N&nbsp;────►NP&nbsp;──────►CONJP──┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NUM────────&nbsp;─────────&nbsp;&nbsp;&nbsp;├►NML&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>N&nbsp;─────────&nbsp;─────────&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br>N&nbsp;─────────&nbsp;─────────&nbsp;──┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>N&nbsp;─────────&nbsp;─────────&nbsp;──────────┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►PP────┤&nbsp;&nbsp;&nbsp;<br>P&nbsp;─────────&nbsp;─────────&nbsp;──────────────────┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>N&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>N&nbsp;───┴►NP&nbsp;─────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>P&nbsp;─────────&nbsp;───┴►────&nbsp;───────────────────►PP────┤&nbsp;&nbsp;&nbsp;<br>VB&nbsp;────────&nbsp;─────────&nbsp;──────────────────────────┤&nbsp;&nbsp;&nbsp;<br>VB0────────&nbsp;─────────&nbsp;──────────────────────────┤&nbsp;&nbsp;&nbsp;<br>AX&nbsp;────────&nbsp;─────────&nbsp;──────────────────────────┤&nbsp;&nbsp;&nbsp;<br>PU&nbsp;────────&nbsp;─────────&nbsp;──────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div><br><div 
style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Dep&nbsp;Tree&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>──────────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>┌─────────►├──&nbsp;<br>│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└─►&nbsp;<br>│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─────►&nbsp;<br>│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│┌────►&nbsp;<br>│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;││┌───►&nbsp;<br>│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│││┌──►&nbsp;<br>│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;││││┌─►&nbsp;<br>│&nbsp;&nbsp;&nbsp;┌─►└┴┴┴┼──&nbsp;<br>│&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└─►&nbsp;<br>│&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>│&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;┌─►└──&nbsp;<br>│&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;┌─►&nbsp;<br>│&nbsp;&nbsp;&nbsp;│┌─►└──┼──&nbsp;<br>│&nbsp;&nbsp;&nbsp;││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└─►&nbsp;<br>│┌─►└┴─────┬──&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└─►&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌──►&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│┌─►&nbsp;<br>││&nbsp;&nbsp;&nbsp;┌─►┌┬┼┼──&nbsp;<br>││&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;│││└─►&nbsp;<br>││&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;││└──►&nbsp;<br>││&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;│└───►&nbsp;<br>││&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;└────►&nbsp;<br>││&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>└┴───┴────┬┼──&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│└─►&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└──►&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Toke&nbsp;<br>────&nbsp;<br>奈須&nbsp;&nbsp;&nbsp;<br>きのこ&nbsp;&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;<br>1973&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;<br>11&nbsp;&nbsp;&nbsp;<br>月&nbsp;&nbsp;&nbsp;&nbsp;<br>28&nbsp;&nbsp;&nbsp;<br>日&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;<br>千葉&nbsp;&nbsp;&nbsp;<br>県&nbsp;&nbsp;&nbsp;&nbsp;<br>円空&nbsp;&nbsp;&nbsp;<br>山&nbsp;&nbsp;&nbsp;&nbsp;<br>で&nbsp;&nbsp;&nbsp;&nbsp;<br>生まれ&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;<br>ゲーム&nbsp;&nbsp;<br>制作&nbsp;&nbsp;&nbsp;<br>会社&nbsp;&nbsp;&nbsp;<br>「&nbsp;&nbsp;&nbsp;&nbsp;<br>ノーツ&nbsp;&nbsp;<br>」&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;<br>設立&nbsp;&nbsp;&nbsp;<br>者&nbsp;&nbsp;&nbsp;&nbsp;<br>だ&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Relation&nbsp;<br>────────&nbsp;<br>compound&nbsp;<br>nsubj&nbsp;&nbsp;&nbsp;&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>compound&nbsp;<br>compound&nbsp;<br>compound&nbsp;<br>nummod&nbsp;&nbsp;&nbsp;<br>obl&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>nmod&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>obl&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>acl&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>punct&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>compound&nbsp;<br>nmod&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>punct&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>punct&nbsp;&nbsp;&nbsp;&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>root&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>cop&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>punct&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">PoS&nbsp;<br>───&nbsp;<br>NPR&nbsp;<br>NPR&nbsp;<br>P&nbsp;&nbsp;&nbsp;<br>NUM&nbsp;<br>CL&nbsp;&nbsp;<br>NUM&nbsp;<br>CL&nbsp;&nbsp;<br>NUM&nbsp;<br>CL&nbsp;&nbsp;<br>P&nbsp;&nbsp;&nbsp;<br>NPR&nbsp;<br>NPR&nbsp;<br>NPR&nbsp;<br>NPR&nbsp;<br>P&nbsp;&nbsp;&nbsp;<br>VB&nbsp;&nbsp;<br>PU&nbsp;&nbsp;<br>N&nbsp;&nbsp;&nbsp;<br>N&nbsp;&nbsp;&nbsp;<br>N&nbsp;&nbsp;&nbsp;<br>PUL&nbsp;<br>NPR&nbsp;<br>PUR&nbsp;<br>P&nbsp;&nbsp;&nbsp;<br>N&nbsp;&nbsp;&nbsp;<br>N&nbsp;&nbsp;&nbsp;<br>AX&nbsp;&nbsp;<br>PU&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;&nbsp;<br>────&nbsp;<br>奈須&nbsp;&nbsp;&nbsp;<br>きのこ&nbsp;&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;<br>1973&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;<br>11&nbsp;&nbsp;&nbsp;<br>月&nbsp;&nbsp;&nbsp;&nbsp;<br>28&nbsp;&nbsp;&nbsp;<br>日&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;<br>千葉&nbsp;&nbsp;&nbsp;<br>県&nbsp;&nbsp;&nbsp;&nbsp;<br>円空&nbsp;&nbsp;&nbsp;<br>山&nbsp;&nbsp;&nbsp;&nbsp;<br>で&nbsp;&nbsp;&nbsp;&nbsp;<br>生まれ&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;<br>ゲーム&nbsp;&nbsp;<br>制作&nbsp;&nbsp;&nbsp;<br>会社&nbsp;&nbsp;&nbsp;<br>「&nbsp;&nbsp;&nbsp;&nbsp;<br>ノーツ&nbsp;&nbsp;<br>」&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;<br>設立&nbsp;&nbsp;&nbsp;<br>者&nbsp;&nbsp;&nbsp;&nbsp;<br>だ&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">NER&nbsp;Type&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────────&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┴►PERSON&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►DATE&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►LOCATION&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<b
r>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►ORGANIZATION&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;&nbsp;<br>────&nbsp;<br>奈須&nbsp;&nbsp;&nbsp;<br>きのこ&nbsp;&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;<br>1973&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;<br>11&nbsp;&nbsp;&nbsp;<br>月&nbsp;&nbsp;&nbsp;&nbsp;<br>28&nbsp;&nbsp;&nbsp;<br>日&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;<br>千葉&nbsp;&nbsp;&nbsp;<br>県&nbsp;&nbsp;&nbsp;&nbsp;<br>円空&nbsp;&nbsp;&nbsp;<br>山&nbsp;&nbsp;&nbsp;&nbsp;<br>で&nbsp;&nbsp;&nbsp;&nbsp;<br>生まれ&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;<br>ゲーム&nbsp;&nbsp;<br>制作&nbsp;&nbsp;&nbsp;<br>会社&nbsp;&nbsp;&nbsp;<br>「&nbsp;&nbsp;&nbsp;&nbsp;<br>ノーツ&nbsp;&nbsp;<br>」&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;<br>設立&nbsp;&nbsp;&nbsp;<br>者&nbsp;&nbsp;&nbsp;&nbsp;<br>だ&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA1&nbsp;&nbsp;<br>────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┴►ノ？&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;<br>────&nbsp;<br>奈須&nbsp;&nbsp;&nbsp;<br>きのこ&nbsp;&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;<br>1973&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;<br>11&nbsp;&nbsp;&nbsp;<br>月&nbsp;&nbsp;&nbsp;&nbsp;<br>28&nbsp;&nbsp;&nbsp;<br>日&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;<br>千葉&nbsp;&nbsp;&nbsp;<br>県&nbsp;&nbsp;&nbsp;&nbsp;<br>円空&nbsp;&nbsp;&nbsp;<br>山&nbsp;&nbsp;&nbsp;&nbsp;<br>で&nbsp;&nbsp;&nbsp;&nbsp;<br>生まれ&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;<br>ゲーム&nbsp;&nbsp;<br>制作&nbsp;&nbsp;&nbsp;<br>会社&nbsp;&nbsp;&nbsp;<br>「&nbsp;&nbsp;&nbsp;&nbsp;<br>ノーツ&nbsp;&nbsp;<br>」&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;<br>設立&nbsp;&nbsp;&nbsp;<br>者&nbsp;&nbsp;&nbsp;&nbsp;<br>だ&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">SRL&nbsp;PA2&nbsp;&nbsp;<br>────────&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ガ&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►時間&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►デ&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&n
bsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;&nbsp;<br>────&nbsp;<br>奈須&nbsp;&nbsp;&nbsp;<br>きのこ&nbsp;&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;<br>1973&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;<br>11&nbsp;&nbsp;&nbsp;<br>月&nbsp;&nbsp;&nbsp;&nbsp;<br>28&nbsp;&nbsp;&nbsp;<br>日&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;<br>千葉&nbsp;&nbsp;&nbsp;<br>県&nbsp;&nbsp;&nbsp;&nbsp;<br>円空&nbsp;&nbsp;&nbsp;<br>山&nbsp;&nbsp;&nbsp;&nbsp;<br>で&nbsp;&nbsp;&nbsp;&nbsp;<br>生まれ&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;<br>ゲーム&nbsp;&nbsp;<br>制作&nbsp;&nbsp;&nbsp;<br>会社&nbsp;&nbsp;&nbsp;<br>「&nbsp;&nbsp;&nbsp;&nbsp;<br>ノーツ&nbsp;&nbsp;<br>」&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;<br>設立&nbsp;&nbsp;&nbsp;<br>者&nbsp;&nbsp;&nbsp;&nbsp;<br>だ&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA3&nbsp;&nbsp;<br>────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┴►ノ&nbsp;&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;<br>────&nbsp;<br>奈須&nbsp;&nbsp;&nbsp;<br>きのこ&nbsp;&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;<br>1973&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;<br>11&nbsp;&nbsp;&nbsp;<br>月&nbsp;&nbsp;&nbsp;&nbsp;<br>28&nbsp;&nbsp;&nbsp;<br>日&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;<br>千葉&nbsp;&nbsp;&nbsp;<br>県&nbsp;&nbsp;&nbsp;&nbsp;<br>円空&nbsp;&nbsp;&nbsp;<br>山&nbsp;&nbsp;&nbsp;&nbsp;<br>で&nbsp;&nbsp;&nbsp;&nbsp;<br>生まれ&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;<br>ゲーム&nbsp;&nbsp;<br>制作&nbsp;&nbsp;&nbsp;<br>会社&nbsp;&nbsp;&nbsp;<br>「&nbsp;&nbsp;&nbsp;&nbsp;<br>ノーツ&nbsp;&nbsp;<br>」&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;<br>設立&nbsp;&nbsp;&nbsp;<br>者&nbsp;&nbsp;&nbsp;&nbsp;<br>だ&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">SRL&nbsp;PA4&nbsp;&nbsp;<br>────────&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ガ&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nb
sp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ヲ&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;&nbsp;<br>────&nbsp;<br>奈須&nbsp;&nbsp;&nbsp;<br>きのこ&nbsp;&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;<br>1973&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;<br>11&nbsp;&nbsp;&nbsp;<br>月&nbsp;&nbsp;&nbsp;&nbsp;<br>28&nbsp;&nbsp;&nbsp;<br>日&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;<br>千葉&nbsp;&nbsp;&nbsp;<br>県&nbsp;&nbsp;&nbsp;&nbsp;<br>円空&nbsp;&nbsp;&nbsp;<br>山&nbsp;&nbsp;&nbsp;&nbsp;<br>で&nbsp;&nbsp;&nbsp;&nbsp;<br>生まれ&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;<br>ゲーム&nbsp;&nbsp;<br>制作&nbsp;&nbsp;&nbsp;<br>会社&nbsp;&nbsp;&nbsp;<br>「&nbsp;&nbsp;&nbsp;&nbsp;<br>ノーツ&nbsp;&nbsp;<br>」&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;<br>設立&nbsp;&nbsp;&nbsp;<br>者&nbsp;&nbsp;&nbsp;&nbsp;<br>だ&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA5&nbsp;&nbsp;<br>────────&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ガ&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;<br>────&nbsp;<br>奈須&nbsp;&nbsp;&nbsp;<br>きのこ&nbsp;&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;<br>1973&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;<br>11&nbsp;&nbsp;&nbsp;<br>月&nbsp;&nbsp;&nbsp;&nbsp;<br>28&nbsp;&nbsp;&nbsp;<br>日&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;<br>千葉&nbsp;&nbsp;&nbsp;<br>県&nbsp;&nbsp;&nbsp;&nbsp;<br>円空&nbsp;&nbsp;&nbsp;<br>山&nbsp;&nbsp;&nbsp;&nbsp;<br>で&nbsp;&nbsp;&nbsp;&nbsp;<br>生まれ&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;<br>ゲーム&nbsp;&nbsp;<br>制作&nbsp;&nbsp;&nbsp;<br>会社&nbsp;&nbsp;&nbsp;<br>「&nbsp;&nbsp;&nbsp;&nbsp;<br>ノーツ&nbsp;&nbsp;<br>」&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;<br>設立&nbsp;&nbsp;&nbsp;<br>者&nbsp;&nbsp;&nbsp;&nbsp;<br>だ&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">PoS&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;9&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;10&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;11<br>───────────────────────────────────────────────────────────────────────────<br>NPR──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NPR──┴►NP&nbsp;─────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbs
p;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>P&nbsp;─────────&nbsp;───┴────────────────────────────────────────────────►PP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>NUM──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>CL&nbsp;──┴►NUMCLP──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NUM──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>CL&nbsp;──┴►NUMCLP──┼►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NUM──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&
nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>CL&nbsp;──┴►NUMCLP──┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►PP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>P&nbsp;─────────&nbsp;───────────┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NPR──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NPR──┴►PP&nbsp;─────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NPR────────&nbsp;&nbsp;&nbsp;&nbsp;├►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├────────────────────────────────►IP────┤&nbsp;&nbsp;&nbsp;<br>NPR────────&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►PP────┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&
nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>P&nbsp;─────────&nbsp;───────────┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>VB&nbsp;────────&nbsp;───────────────────┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br>PU&nbsp;────────&nbsp;───────────────────────────────────────────────────────────┤&nbsp;&nbsp;&nbsp;<br>N&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>N&nbsp;───┴►NP&nbsp;──────►PRN&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>N&nbsp;─────────&nbsp;───────────┴►NP&nbsp;────►PRN&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>PUL────────&nbsp;───────────────────────────┤&nbsp;&nbsp;&nbsp;&nbsp;
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NPR────────&nbsp;───────────────────────────┼►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>PUR────────&nbsp;───────────────────────────┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►PP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>P&nbsp;─────────&nbsp;───────────────────────────────────┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>N&nbsp;─────────&nbsp;───────────────────────────────────────────┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►NP────┤&nbsp;&nbsp;&nbsp;<br>N&nbsp;─────────&nbsp;───────────────────────────────────────────────────┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>AX&nbsp;────────&nbsp;───────────────────────────────────────────────────────────┤&nbsp;&nbsp;&nbsp;<br>PU&nbsp;────────&nbsp;───────────────────────────────────────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {
      "tags": []
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "ja(['2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',\n",
    "    '奈須きのこは1973年11月28日に千葉県円空山で生まれ、ゲーム制作会社「ノーツ」の設立者だ。',]).pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "NifrOGlNK7KD"
   },
   "source": [
    "以及支持[130种语言](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/mtl.html#hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6)的多语种联合模型："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "ae-4j5sbK7KD",
    "outputId": "2777cc5d-c1c5-4091-b754-0c220dafea8a"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": []
    },
    {
     "data": {
      "text/html": [
       "<div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Dep&nbsp;Tree&nbsp;&nbsp;&nbsp;<br>──────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;┌─►├──&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;└─►&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;┌─►&nbsp;<br>┌┬┬─┴──┴──&nbsp;<br>│││&nbsp;&nbsp;┌───►&nbsp;<br>│││&nbsp;&nbsp;│┌──►&nbsp;<br>│││&nbsp;&nbsp;││┌─►&nbsp;<br>││└─►└┴┴──&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;┌──►&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;│┌─►&nbsp;<br>│└───►└┴──&nbsp;<br>└────────►&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Token&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────────&nbsp;<br>In&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>delivers&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>state-of-the-art&nbsp;<br>multilingual&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>techniques&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>to&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>production&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>environments&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>.&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier 
New,monospace; white-space: nowrap;\">Relation&nbsp;<br>────────&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>obl&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>punct&nbsp;&nbsp;&nbsp;&nbsp;<br>nsubj&nbsp;&nbsp;&nbsp;&nbsp;<br>root&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>amod&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>amod&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>obj&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>obl&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>punct&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Lemma&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────────&nbsp;<br>in&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HANlpv2.1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>deliver&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>state-of-the-art&nbsp;<br>multilingual&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>technique&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>to&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>production&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>environment&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>.&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">PoS&nbsp;&nbsp;&nbsp;<br>─────&nbsp;<br>ADP&nbsp;&nbsp;&nbsp;<br>NUM&nbsp;&nbsp;&nbsp;<br>PUNCT&nbsp;<br>PROPN&nbsp;<br>VERB&nbsp;&nbsp;<br>ADJ&nbsp;&nbsp;&nbsp;<br>ADJ&nbsp;&nbsp;&nbsp;<br>PROPN&nbsp;<br>NOUN&nbsp;&nbsp;<br>ADP&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>PUNCT&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────────&nbsp;<br>In&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>delivers&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>state-of-the-art&nbsp;<br>multilingual&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>techniques&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>to&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>production&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>environments&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>.&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">NER&nbsp;Type&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───────────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►DATE&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►WORK_OF_ART&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────────&nbsp;<br>In&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>delivers&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>state-of-the-art&nbsp;<br>multilingual&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>techniques&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>to&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>production&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>environments&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>.&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┴►ARGM-TMP&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>───►ARG0&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;├►ARG2&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────────&nbsp;<br>In&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>,&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>delivers&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>state-of-the-art&nbsp;<br>multilingual&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>techniques&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>to&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>production&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>environments&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>.&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">PoS&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6<br>──────────────────────────────────<br>ADP&nbsp;───────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NUM&nbsp;────►NP&nbsp;───┴────────►PP&nbsp;───┐&nbsp;&nbsp;<br>PUNCT──────────────────────────┤&nbsp;&nbsp;<br>PROPN───────────────────►NP────┤&nbsp;&nbsp;<br>VERB&nbsp;──────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;<br>ADJ&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;<br>ADJ&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;<br>PROPN&nbsp;&nbsp;├────────►NP────┼►VP────┼►S<br>NOUN&nbsp;──┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;<br>ADP&nbsp;───────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;<br>NOUN&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►PP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;<br>NOUN&nbsp;──┴►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;<br>PUNCT──────────────────────────┘&nbsp;&nbsp;</pre></div><br><div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Dep&nbsp;Tree&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>┌────────►├──&nbsp;<br>│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└─►&nbsp;<br>│┌───────►┌──&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└─►&nbsp;<br>││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;&nbsp;┌───►├──&nbsp;<br>││&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;└─►&nbsp;<br>││&nbsp;&nbsp;&nbsp;│┌─────►&nbsp;<br>││&nbsp;&nbsp;&nbsp;││┌────►&nbsp;<br>││&nbsp;&nbsp;&nbsp;│││┌───►&nbsp;<br>││&nbsp;&nbsp;&nbsp;││││┌──►&nbsp;<br>││&nbsp;&nbsp;&nbsp;│││││┌─►&nbsp;<br>││┌─►└┴┴┴┴┼──&nbsp;<br>│││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└─►&nbsp;<br>│││&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>│││&nbsp;&nbsp;&nbsp;&nbsp;┌─►├──&nbsp;<br>│││&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;└─►&nbsp;<br>└┴┴────┴─┬┬──&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│└─►&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;└──►&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Token&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先端&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>言語&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技術&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>を&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>本番&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>環境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>導入&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>します&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Relation&nbsp;<br>────────&nbsp;<br>nummod&nbsp;&nbsp;&nbsp;<br>obl&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>punct&nbsp;&nbsp;&nbsp;&nbsp;<br>nsubj&nbsp;&nbsp;&nbsp;&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>nmod&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>compound&nbsp;<br>compound&nbsp;<br>compound&nbsp;<br>compound&nbsp;<br>obj&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>compound&nbsp;<br>obl&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>root&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>aux&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>punct&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Lemma&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HANLPV2.1&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先端&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>言語&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技術&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>を&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>本番&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>環境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>導入&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>します&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">PoS&nbsp;&nbsp;&nbsp;<br>─────&nbsp;<br>NUM&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>PUNCT&nbsp;<br>NOUN&nbsp;&nbsp;<br>ADP&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>ADP&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>ADP&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>ADP&nbsp;&nbsp;&nbsp;<br>VERB&nbsp;&nbsp;<br>AUX&nbsp;&nbsp;&nbsp;<br>PUNCT&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先端&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>言語&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技術&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>を&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>本番&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>環境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>導入&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>します&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nb
sp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">NER&nbsp;Type&nbsp;<br>────────&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┴►DATE&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>、&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>は&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>の&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先端&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>言語&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技術&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>を&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>本番&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>環境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>に&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>導入&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>します&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">PoS&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;9&nbsp;<br>───────────────────────────────────────────────────────────<br>NUM&nbsp;───────────────────────────────────────────────────┐&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;──────────────────────────────────────────────────┤&nbsp;&nbsp;&nbsp;<br>PUNCT──────────────────────────────────────────────────┤&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;──────────────────────────────────────────────────┤&nbsp;&nbsp;&nbsp;<br>ADP&nbsp;───────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;──────────────────────────┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;──────────────────────────┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>ADP&nbsp;───────────────────────────┼►VP&nbsp;────►VP&nbsp;────►IP────┤&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;───►ADJP──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;───►ADJP──┴►ADJP──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;───────────►ADJP──┴►ADJP──┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&n
bsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br>NOUN&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;&nbsp;├►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;──┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>ADP&nbsp;───────────┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;──────────────────┼►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;──────────────────┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>ADP&nbsp;────────────────────►PP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>VERB&nbsp;──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbs
p;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├────────►NP────┤&nbsp;&nbsp;&nbsp;<br>AUX&nbsp;───┴────────────────────────►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>PUNCT──────────────────────────────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div><br><div style=\"display: table; line-height: 128%;\"><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Dep&nbsp;Tree&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>&nbsp;&nbsp;&nbsp;┌────►└──&nbsp;<br>&nbsp;&nbsp;&nbsp;│┌──────►&nbsp;<br>&nbsp;&nbsp;&nbsp;││&nbsp;&nbsp;&nbsp;┌──►&nbsp;<br>&nbsp;&nbsp;&nbsp;││&nbsp;&nbsp;&nbsp;│┌─►&nbsp;<br>&nbsp;&nbsp;&nbsp;││┌─►└┴──&nbsp;<br>┌┬─┴┴┴──────&nbsp;<br>││&nbsp;&nbsp;┌──────►&nbsp;<br>││&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;│┌──►└──&nbsp;<br>││&nbsp;&nbsp;││&nbsp;&nbsp;&nbsp;┌─►&nbsp;<br>││&nbsp;&nbsp;││┌─►└──&nbsp;<br>││&nbsp;&nbsp;│││&nbsp;&nbsp;┌─►&nbsp;<br>│└─►└┴┴──┴──&nbsp;<br>└──────────►&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Token&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Relation&nbsp;&nbsp;<br>─────────&nbsp;<br>nummod&nbsp;&nbsp;&nbsp;&nbsp;<br>nmod:tmod&nbsp;<br>nsubj&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>case&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>nmod&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>obl&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>root&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>nmod&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>advmod&nbsp;&nbsp;&nbsp;&nbsp;<br>amod&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>nummod&nbsp;&nbsp;&nbsp;&nbsp;<br>nmod&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>nmod&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>obj&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>punct&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Lemma&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HANlpv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">PoS&nbsp;&nbsp;&nbsp;<br>─────&nbsp;<br>NUM&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>X&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>ADP&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>VERB&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>ADV&nbsp;&nbsp;&nbsp;<br>ADJ&nbsp;&nbsp;&nbsp;<br>NUM&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>X&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;&nbsp;<br>PUNCT&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">NER&nbsp;Type&nbsp;&nbsp;&nbsp;<br>──────────&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┴►DATE&nbsp;&nbsp;&nbsp;<br>───►PERSON&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</p
re><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">SRL&nbsp;PA1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>────────────&nbsp;<br>◄─┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>◄─┴►ARGM-TMP&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>╟──►PRED&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: 
nowrap;\">Tok&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>─────────&nbsp;<br>2021&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>年&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>HanLPv2.1&nbsp;<br>为&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>生产&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>环境&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>带来&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>次世代&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>最&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>先进的&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>多&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>语种&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NLP&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>技术&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>。&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</pre><pre style=\"display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace; white-space: nowrap;\">PoS&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;3&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;6&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;7&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;8&nbsp;<br>───────────────────────────────────────────────────<br>NUM&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;──┴────────────────────────────────►NP&nbsp;───┐&nbsp;&nbsp;&nbsp;<br>X&nbsp;──────────────────────────────────────►NP────┤&nbsp;&nbsp;&nbsp;<br>ADP&nbsp;───────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;──┐&nbsp;
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├────────────────►PP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;──┴►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>VERB&nbsp;──────────────────────────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP────┤&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;───────────►ADJP──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>ADV&nbsp;────►ADVP──┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►VP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►IP<br>ADJ&nbsp;────►ADJP──┴►ADJP──┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NUM&nbsp;────►QP&nbsp;───┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;├►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;───►NP&nbsp;───┴►NP────┤&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>X&nbsp;─────┐&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>NOUN&nbsp;──┴────────►NP&nbsp;───┘&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;│&nbsp;&nbsp;&nbsp;<br>PUNCT──────────────────────────────────────────┘&nbsp;&nbsp;&nbsp;</pre></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {
      "tags": []
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "from hanlp.utils.torch_util import gpus_available\n",
    "if gpus_available(): # 建议在GPU上运行XLMR_BASE，否则运行mini模型\n",
    "    mul = hanlp.load(hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE)\n",
    "else:\n",
    "    if 'ja' in globals(): # Binder内存只有2G，释放已加载的模型\n",
    "        del ja\n",
    "    mul = hanlp.load(hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6)\n",
    "mul(['In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.',\n",
    "     '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',\n",
    "     '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。']).pretty_print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "0QV_93CjK7KD"
   },
   "source": [
    "你可以在下面输入你想执行的代码~"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "tutorial.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: plugins/hanlp_demo/setup.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 19:26
from os.path import abspath, join, dirname
from setuptools import find_packages, setup

# Resolve the README relative to this script so the build does not depend on
# the current working directory.
this_dir = abspath(dirname(__file__))
readme_path = join(this_dir, 'README.md')
with open(readme_path, encoding='utf-8') as readme:
    long_description = readme.read()

# Trove classifiers describing audience, maturity, license and topic.
trove_classifiers = [
    'Intended Audience :: Science/Research',
    'Intended Audience :: Developers',
    "Development Status :: 3 - Alpha",
    'Operating System :: OS Independent',
    "License :: OSI Approved :: Apache Software License",
    'Programming Language :: Python :: 3 :: Only',
    'Topic :: Scientific/Engineering :: Artificial Intelligence',
    "Topic :: Text Processing :: Linguistic"
]

# Runtime dependencies of the demo package.
requirements = ['hanlp_common']

setup(
    # Identity of the distribution.
    name='hanlp_demo',
    version='0.0.1',
    description='HanLP: Han Language Processing',
    # The README is rendered as the long description (Markdown).
    long_description=long_description,
    long_description_content_type="text/markdown",
    # Project and author metadata.
    url='https://github.com/hankcs/HanLP',
    author='hankcs',
    author_email='hankcshe@gmail.com',
    license='Apache License 2.0',
    classifiers=trove_classifiers,
    keywords='corpus,machine-learning,NLU,NLP',
    # Ship every discovered package except documentation and tests.
    packages=find_packages(exclude=['docs', 'tests*']),
    include_package_data=True,
    install_requires=requirements,
    python_requires='>=3.6',
)


================================================
FILE: plugins/hanlp_restful/README.md
================================================
# RESTful API Client for HanLP

[中文](https://github.com/hankcs/HanLP/tree/doc-zh) | [1.x](https://github.com/hankcs/HanLP/tree/1.x) | [forum](https://bbs.hankcs.com/) | [docker](https://github.com/WalterInSH/hanlp-jupyter-docker)

The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be efficient, user-friendly and extensible. It comes with pretrained models for various human languages including English, Chinese and many others. Currently, HanLP 2.0 is in the alpha stage with more killer features on the roadmap. Discussions are welcome on our [forum](https://bbs.hankcs.com/), while bug reports and feature requests are reserved for GitHub issues. For Java users, please check out the [1.x](https://github.com/hankcs/HanLP/tree/1.x) branch.


## Installation

```bash
pip install hanlp-restful
```

## License

HanLP is licensed under **Apache License 2.0**. You can use HanLP in your commercial products for free. We would appreciate it if you add a link to HanLP on your website.


================================================
FILE: plugins/hanlp_restful/hanlp_restful/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-29 17:48
import json
from typing import Union, List, Optional, Dict, Any, Tuple
from urllib.error import HTTPError
from urllib.parse import urlencode
from urllib.request import Request, urlopen
from hanlp_common.document import Document

# Two interchangeable implementations of ``_post`` are defined below: one backed
# by ``requests`` when it can be imported, and a standard-library fallback built
# on ``urllib`` otherwise. Both share the same signature.
try:
    # noinspection PyUnresolvedReferences
    import requests


    def _post(url, form: Dict[str, Any], headers: Dict[str, Any], timeout=60, verify=True) -> str:
        """POST ``form`` as a JSON body to ``url`` through ``requests``.

        Args:
            url: The endpoint to post to.
            form: The payload, passed to ``requests`` as ``json=form``.
            headers: Extra HTTP headers to send with the request.
            timeout: Maximum waiting time in seconds.
            verify: Forwarded unchanged to ``requests.post(verify=...)``.

        Returns:
            The response body as text.

        Raises:
            HTTPError: If the response status code is anything other than 200.
        """
        response = requests.post(url, json=form, headers=headers, timeout=timeout, verify=verify)
        if response.status_code != 200:
            # Re-raise as urllib's HTTPError so callers handle one exception type
            # regardless of which backend is in use.
            raise HTTPError(url, response.status_code, response.text, response.headers, None)
        return response.text
except ImportError:
    import ssl


    def _post(url, form: Dict[str, Any], headers: Dict[str, Any], timeout=60, verify=True) -> str:
        """POST ``form`` as a JSON body to ``url`` using only the standard library.

        Args:
            url: The endpoint to post to.
            form: The payload, serialized with ``json.dumps``.
            headers: Extra HTTP headers to send with the request. A ``Content-Type``
                given here overrides the JSON default set by this function.
            timeout: Maximum waiting time in seconds.
            verify: A falsy value disables hostname checking and certificate
                verification. Any truthy value (including a path) leaves
                ``urlopen`` on its default SSL handling.

        Returns:
            The decoded response body.
        """
        request = Request(url, json.dumps(form).encode())
        # Without an explicit Content-Type, urllib labels a request body as
        # ``application/x-www-form-urlencoded``. Declare the JSON payload so this
        # backend agrees with the ``requests`` one; caller headers are added
        # afterwards and therefore still take precedence.
        request.add_header('Content-Type', 'application/json')
        for k, v in headers.items():
            request.add_header(k, v)
        ctx = None
        if not verify:
            ctx = ssl.create_default_context()
            # check_hostname must be switched off before verify_mode can be CERT_NONE.
            ctx.check_hostname = False
            ctx.verify_mode = ssl.CERT_NONE
        # Close the response deterministically instead of leaking the connection.
        with urlopen(request, timeout=timeout, context=ctx) as response:
            return response.read().decode()


class HanLPClient(object):

    def __init__(self, url: str, auth: str = None, language=None, timeout=60, verify=True) -> None:
        """

        Args:
            url (str): An API endpoint to a service provider.
            auth (str): An auth key licenced from a service provider.
            language (str): The default language for each :func:`~hanlp_restful.HanLPClient.parse` call.
                Contact the service provider for the list of languages supported.
                Conventionally, ``zh`` is used for Chinese and ``mul`` for multilingual.
                Leave ``None`` to use the default language on server.
            timeout (int): Maximum waiting time in seconds for a request.
            verify (bool): ``True`` to enable SSL cert verification. You can also pass ``verify`` the path to a CA_BUNDLE
                file or directory with certificates of trusted CAs (``requests`` required).
        """
        super().__init__()
        self._language = language
        self._timeout = timeout
        self._url = url
        if auth is None:
            import os
            auth = os.getenv('HANLP_AUTH', None)
        self._auth = auth
        self._verify = verify

    def parse(self,
              text: Union[str, List[str]] = None,
              tokens: List[List[str]] = None,
              tasks: Optional[Union[str, List[str]]] = None,
              skip_tasks: Optional[Union[str, List[str]]] = None,
              language: str = None,
              ) -> Document:
        """
        Parse a piece of text.

        Args:
            text: A document (str), or a list of sentences (List[str]).
            tokens: A list of sentences where each sentence is a list of tokens.
            tasks: The tasks to predict. Use ``tasks=[...]`` to run selected tasks only. Dependent tasks will be
                automatically selected.
            skip_tasks: The tasks to skip. Use ``skip_tasks='tok/fine'`` to enable coarse tokenization for all tasks.
                Use ``tasks=['tok/coarse', ...]`` and ``skip_tasks='tok/fine'`` to enable coarse tokenization for
                selected tasks.
            language: The language of input text or tokens. ``None`` to use the default language on server.

        Returns:
            A :class:`~hanlp_common.document.Document`.

        Examples::

            # Use tasks=[...] to run selected tasks only
            HanLP('晓美焰来到自然语义科技公司', tasks=['pos', 'ner'])

            # Use skip_tasks='tok/fine' to enable coarse tokenization for all tasks
            HanLP('晓美焰来到自然语义科技公司', skip_tasks='tok/fine')

            # Use tasks=['tok/coarse', ...] and skip_tasks='tok/fine' to enable
            # coarse tokenization for selected tasks
            HanLP('晓美焰来到自然语义科技公司', tasks=['tok/coarse','pos'],skip_tasks='tok/fine')


        Raises:
            HTTPError: Any errors happening on the Internet side or the server side. Refer to the ``code`` and ``msg``
                of the exception for more details. A list of common errors :

        - ``400 Bad Request`` indicates that the server cannot process the request due to a client
          fault (e.g., text too long, language unsupported).
        - ``401 Unauthorized`` indicates that the request lacks **valid** ``auth`` credentials for the API.
        - ``422 Unprocessable Entity`` indicates that the content type of the request entity is not in
          proper json format.
        - ``429 Too Many Requests`` indicates the user has sent too many requests in a given
          amount of time ("rate limiting").

        """
        assert text or tokens, 'At least one of text or tokens has to be specified.'
        response = self._send_post_json(self._url + '/parse', {
            'text': text,
            'tokens': tokens,
            'tasks': tasks,
            'skip_tasks': skip_tasks,
            'language': language or self._language
        })
        return Document(response)

    def __call__(self,
                 text: Union[str, List[str]] = None,
                 tokens: List[List[str]] = None,
                 tasks: Optional[Union[str, List[str]]] = None,
                 skip_tasks: Optional[Union[str, List[str]]] = None,
                 language: str = None,
                 ) -> Document:
        """
        A shortcut of :meth:`~hanlp_restful.HanLPClient.parse`.
        """
        return self.parse(text, tokens, tasks, skip_tasks, language)

    def about(self) -> Dict[str, Any]:
        """Query the ``/about`` endpoint of the server.

        Returns:
            The decoded JSON reply wrapped in a :class:`~hanlp_common.document.Document`, containing your rate
            limit and server version etc.

        """
        endpoint = self._url + '/about'
        return Document(self._send_get_json(endpoint, {}))

    def _send_post(self, url, form: Dict[str, Any]):
        request = Request(url, json.dumps(form).encode())
        self._add_headers(request)
        return self._fire_request(request)

    def _fire_request(self, request):
        return urlopen(request, timeout=self._timeout).read().decode()

    def _send_post_json(self, url, form: Dict[str, Any]):
        """POST ``form`` to ``url`` through the module level ``_post`` and decode the JSON reply.

        Args:
            url: The endpoint to post to.
            form: The mapping to send.

        Returns:
            The response body parsed with ``json.loads``.
        """
        # Only authenticated clients send an ``Authorization`` header.
        headers = {'Authorization': f'Basic {self._auth}'} if self._auth else {}
        body = _post(url, form, headers, self._timeout, verify=self._verify)
        return json.loads(body)

    def _send_get(self, url, form: Dict[str, Any]):
        request = Request(url + '?' + urlencode(form))
        self._add_headers(request)
        return self._fire_request(request)

    def _add_headers(self, request):
        if self._auth:
            request.add_header('Authorization', f'Basic {self._auth}')

    def _send_get_json(self, url, form: Dict[str, Any]):
        return json.loads(self._send_get(url, form))

    def text_style_transfer(self, text: Union[str, List[str]], target_style: str, language: str = None) \
            -> Union[str, List[str]]:
        """ Text style transfer aims to change the style of the input text to the target style while preserving its
        content.

        Args:
            text: Source text.
            target_style: Target style.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            Text or a list of text of the target style.

        Examples::

            HanLP.text_style_transfer(['国家对中石油抱有很大的期望.', '要用创新去推动高质量的发展。'],
                                      target_style='gov_doc')
            # Output:
            [
                '国家对中石油寄予厚望。',
                '要以创新驱动高质量发展。'
            ]

            HanLP.text_style_transfer('我看到了窗户外面有白色的云和绿色的森林',
                                      target_style='modern_poetry')
            # Output:
            '我看见窗外的白云绿林'
        """
        response = self._send_post_json(self._url + '/text_style_transfer',
                                        {'text': text, 'target_style': target_style,
                                         'language': language or self._language})
        return response

    def semantic_textual_similarity(self, text: Union[Tuple[str, str], List[Tuple[str, str]]], language: str = None) \
            -> Union[float, List[float]]:
        """ Semantic textual similarity deals with determining how similar two pieces of texts are.

        Args:
            text: A pair or pairs of text.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            Similarities.

        Examples::

            HanLP.semantic_textual_similarity([
                ('看图猜一电影名', '看图猜电影'),
                ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'),
                ('北京到上海的动车票', '上海到北京的动车票'),
            ])
            # Output:
            [
                0.9764469, # Similarity of ('看图猜一电影名', '看图猜电影')
                0.0,       # Similarity of ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用')
                0.0034587  # Similarity of ('北京到上海的动车票', '上海到北京的动车票')
            ]
        """
        response = self._send_post_json(self._url + '/semantic_textual_similarity',
                                        {'text': text, 'language': language or self._language})
        return response

    def coreference_resolution(self, text: Optional[str] = None, tokens: Optional[List[List[str]]] = None,
                               speakers: Optional[List[str]] = None, language: Optional[str] = None) -> Union[
        Dict[str, Union[List[str], List[List[Tuple[str, int, int]]]]], List[List[Tuple[str, int, int]]]]:
        r""" Coreference resolution is the task of clustering mentions in text that refer to the same underlying
        real world entities.

        Args:
            text: A piece of text, usually a document without tokenization.
            tokens: A list of sentences where each sentence is a list of tokens.
            speakers: A list of speakers where each speaker is a ``str`` representing the speaker's ID, e.g., ``Tom``.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            When ``text`` is specified, return the clusters and tokens. Otherwise just the clusters, In this case, you need to ``sum(tokens, [])`` in order to match the span indices with tokens

        Examples::

            HanLP.coreference_resolution('我姐送我她的猫。我很喜欢它。')
            # Output:
            {'clusters': [
                          [['我', 0, 1], ['我', 3, 4], ['我', 8, 9]], # 指代说话人
                          [['我姐', 0, 2], ['她', 4, 5]],             # 指代说话人的姐姐
                          [['她的猫', 4, 7], ['它', 11, 12]]],        # 指代说话人的姐姐的猫
             'tokens': ['我', '姐', '送', '我', '她', '的', '猫', '。',
                        '我', '很', '喜欢', '它', '。']}

            HanLP.coreference_resolution(
            tokens=[['我', '姐', '送', '我', '她', '的', '猫', '。'],
                    ['我', '很', '喜欢', '它', '。']])
            # Output:
                         [
                          [['我', 0, 1], ['我', 3, 4], ['我', 8, 9]], # 指代说话人
                          [['我姐', 0, 2], ['她', 4, 5]],             # 指代说话人的姐姐
                          [['她的猫', 4, 7], ['它', 11, 12]]],        # 指代说话人的姐姐的猫

        .. image:: https://file.hankcs.com/img/coref_demo_small.png
            :alt: Coreference resolution visualization
        """
        response = self._send_post_json(self._url + '/coreference_resolution',
                                        {'text': text, 'tokens': tokens, 'speakers': speakers,
                                         'language': language or self._language})
        return response

    def tokenize(self, text: Union[str, List[str]], coarse: Optional[bool] = None, language=None) -> List[List[str]]:
        """ Split a document into sentences and tokenize them. Note that it is always faster to tokenize a whole
        document than to tokenize each sentence one by one. So avoid calling this method sentence by sentence but put
        sentences into a ``list`` and pass them to the ``text`` argument.

        Args:
            text: A document (``str``), or a list of sentences (``List[str]``).
            coarse: Whether to perform coarse-grained or fine-grained tokenization.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            A list of tokenized sentences.

        Examples::

            # Avoid tokenizing sentence by sentence, it is expensive:
            HanLP.tokenize('商品和服务。')
            [['商品', '和', '服务', '。']]
            HanLP.tokenize('阿婆主来到北京立方庭参观自然语义科技公司')
            [['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司']]

            # Instead, the following codes are much faster:
            HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司')
            [['商品', '和', '服务', '。'],
             ['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司']]

            # To tokenize with coarse-grained standard:
            HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司', coarse=True)
            [['商品', '和', '服务', '。'],
             ['阿婆主', '来到', '北京', '立方庭', '参观', '自然语义科技公司']]

            # To tokenize pre-segmented sentences:
            HanLP.tokenize(['商品和服务。', '当下雨天地面积水分外严重'])
            [['商品', '和', '服务', '。'],
             ['当', '下雨天', '地面', '积水', '分', '外', '严重']]

            # Multilingual tokenization by specifying language='mul':
            HanLP.tokenize(
                ['In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques
                 'to production environment.',
                 '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
                 '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'], language='mul')
            [['In', '2021', ',', 'HanLPv2.1', 'delivers', 'state-of-the-art', 'multilingual',
              'NLP', 'techniques', 'to', 'production', 'environment', '.'],
             ['2021', '年', '、', 'HanLPv2.1', 'は', '次', '世代', 'の', '最', '先端', '多',
              '言語', 'NLP', '技術', 'を', '本番', '環境', 'に', '導入', 'します', '。'],
             ['2021', '年', 'HanLPv2.1', '为', '生产', '环境', '带来', '次世代', '最', '先进的',
              '多', '语种', 'NLP', '技术', '。']]
        """
        language = language or self._language
        if coarse and language and language != 'zh':
            raise NotImplementedError(f'Coarse tokenization not supported for {language}. Please set language="zh".')
        doc = self.parse(text=text, tasks='tok/coarse' if coarse is True else 'tok', language=language)
        return next(iter(doc.values()))

    def abstract_meaning_representation(self,
                                        text: Union[str, List[str]] = None,
                                        tokens: List[List[str]] = None,
                                        language: str = None,
                                        visualization: str = None,
                                        ) -> List[Dict]:
        """Abstract Meaning Representation (AMR) captures “who is doing what to whom” in a sentence. Each sentence is
        represented as a rooted, directed, acyclic graph consisting of nodes (concepts) and edges (relations).

        Args:
            text: A document (str), or a list of sentences (List[str]).
            tokens: A list of sentences where each sentence is a list of tokens.
            language: The language of input text or tokens. ``None`` to use the default language on server.
            visualization: Set to `dot` or `svg` to obtain coresspodning visualization.

        Returns:
            Graphs in meaning represenation format.

        Examples::

            HanLP.abstract_meaning_representation('男孩希望女孩相信他。')
            HanLP.abstract_meaning_representation('The boy wants the girl to believe him.',
                                                  language='en')

        .. image:: https://hanlp.hankcs.com/backend/v2/amr_svg?tokens=%E7%94%B7%E5%AD%A9%20%E5%B8%8C%E6%9C%9B%20%E5%A5%B3%E5%AD%A9%20%E7%9B%B8%E4%BF%A1%20%E4%BB%96%20%E3%80%82&language=zh&scale=1
            :alt: Abstract Meaning Representation

        .. image:: https://hanlp.hankcs.com/backend/v2/amr_svg?tokens=The%20boy%20wants%20the%20girl%20to%20believe%20him%20.&language=en&scale=1
            :alt: Abstract Meaning Representation

        """
        assert text or tokens, 'At least one of text or tokens has to be specified.'
        return self._send_post_json(self._url + '/abstract_meaning_representation', {
            'text': text,
            'tokens': tokens,
            'language': language or self._language,
            'visualization': visualization,
        })

    def keyphrase_extraction(
            self,
            text: str,
            topk: int = 10,
            language: str = None,
    ) -> Dict[str, float]:
        """ Keyphrase extraction aims to identify keywords or phrases reflecting the main topics of a document.

        Args:
            text: The text content of the document. Preferably the concatenation of the title and the content.
            topk: The number of top-K ranked keywords or keyphrases.
            language: The language of input text or tokens. ``None`` to use the default language on server.

        Returns:
            A dictionary containing each keyword or keyphrase and its ranking score :math:`s`, :math:`s \in [0, 1]`.

        Examples::

            HanLP.keyphrase_extraction(
                '自然语言处理是一门博大精深的学科，掌握理论才能发挥出HanLP的全部性能。 '
                '《自然语言处理入门》是一本配套HanLP的NLP入门书，助你零起点上手自然语言处理。', topk=3)
            # Output:
            {'自然语言处理': 0.800000011920929,
             'HanLP的全部性能': 0.5258446335792542,
             '一门博大精深的学科': 0.421421080827713}
        """
        assert text, 'Text has to be specified.'
        return self._send_post_json(self._url + '/keyphrase_extraction', {
            'text': text,
            'language': language or self._language,
            'topk': topk,
        })

    def extractive_summarization(
            self,
            text: str,
            topk: int = 3,
            language: str = None,
    ) -> Dict[str, float]:
        """ Single document summarization is the task of selecting a subset of the sentences which best
        represents a summary of the document, with a balance of salience and redundancy.

        Args:
            text: The text content of the document.
            topk: The maximum number of top-K ranked sentences. Note that due to Trigram Blocking tricks, the actual
                number of returned sentences could be less than ``topk``.
            language: The language of input text or tokens. ``None`` to use the default language on server.

        Returns:
            A dictionary containing each sentence and its ranking score :math:`s \in [0, 1]`.

        Examples::

            HanLP.extractive_summarization('''
            据DigiTimes报道，在上海疫情趋缓，防疫管控开始放松后，苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。
            据供应链消息人士称，生产厂的订单拉动情况正在慢慢转强，这会提高MacBook Pro机型的供应量，并缩短苹果客户在过去几周所经历的延长交货时间。
            仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货，由于苹果的供应问题，他们的发货时间被大大推迟了。
            据分析师郭明錤表示，广达是高端MacBook Pro的唯一供应商，自防疫封控依赖，MacBook Pro大部分型号交货时间增加了三到五周，
            一些高端定制型号的MacBook Pro配置要到6月底到7月初才能交货。
            尽管MacBook Pro的生产逐渐恢复，但供应问题预计依然影响2022年第三季度的产品销售。
            苹果上周表示，防疫措施和元部件短缺将继续使其难以生产足够的产品来满足消费者的强劲需求，这最终将影响苹果6月份的收入。
            ''')
            # Output:
            {'据DigiTimes报道，在上海疫情趋缓，防疫管控开始放松后，苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。': 0.9999,
             '仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货，由于苹果的供应问题，他们的发货时间被大大推迟了。': 0.5800,
             '尽管MacBook Pro的生产逐渐恢复，但供应问题预计依然影响2022年第三季度的产品销售。': 0.5422}
        """
        assert text, 'Text has to be non-empty.'
        return self._send_post_json(self._url + '/extractive_summarization', {
            'text': text,
            'language': language or self._language,
            'topk': topk,
        })

    def abstractive_summarization(
            self,
            text: str,
            language: str = None,
    ) -> str:
        r""" Abstractive Summarization is the task of generating a short and concise summary that captures the
        salient ideas of the source text. The generated summaries potentially contain new phrases and sentences that
        may not appear in the source text.

        Args:
            text: The text content of the document.
            language: The language of input text or tokens. ``None`` to use the default language on server.

        Returns:
            Summarization.

        Examples::

            HanLP.abstractive_summarization('''
            每经AI快讯，2月4日，长江证券研究所金属行业首席分析师王鹤涛表示，2023年海外经济衰退，美债现处于历史高位，
            黄金的趋势是值得关注的；在国内需求修复的过程中，看好大金属品种中的铜铝钢。
            此外，在细分的小品种里，建议关注两条主线，一是新能源，比如锂、钴、镍、稀土，二是专精特新主线。（央视财经）
            ''')
            # Output:
            '长江证券：看好大金属品种中的铜铝钢'
        """
        assert text, 'Text has to be non-empty.'
        return self._send_post_json(self._url + '/abstractive_summarization', {
            'text': text,
            'language': language or self._language,
        })

    def grammatical_error_correction(self, text: Union[str, List[str]], language: str = None) \
            -> Union[str, List[str]]:
        """ Grammatical Error Correction (GEC) is the task of correcting different kinds of errors in text such as
        spelling, punctuation, grammatical, and word choice errors.

        Args:
            text: Text potentially containing different kinds of errors such as spelling, punctuation,
                grammatical, and word choice errors.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            Corrected text.

        Examples::

            HanLP.grammatical_error_correction(['每个青年都应当有远大的报复。',
                                                '有的同学对语言很兴趣。'])
            # Output:
            [
                '每个青年都应当有远大的抱负。',
                '有的同学对语言很有兴趣。'
            ]

        """
        response = self._send_post_json(self._url + '/grammatical_error_correction',
                                        {'text': text,
                                         'language': language or self._language})
        return response

    def text_classification(self, text: Union[str, List[str]], model, topk=False, prob=False) -> Union[
        str, Dict[str, float], List[Union[str, Dict[str, float]]]]:
        """
        Text classification is the task of assigning a sentence or document an appropriate category.
        The categories depend on the chosen dataset and can range from topics.

        Args:
            text: A document or a list of documents.
            model: The model to use for prediction.
            topk: ``True`` or ``int`` to return the top-k labels.
            prob: Return also probabilities.

        Returns:

            Classification results.
        """
        response = self._send_post_json(self._url + '/text_classification',
                                        {'text': text, 'model': model, 'topk': topk, 'prob': prob})
        return response

    def sentiment_analysis(self, text: Union[str, List[str]], language=None) -> Union[float, List[float]]:
        r"""
        Sentiment analysis is the task of classifying the polarity of a given text. For instance,
        a text-based tweet can be categorized into either "positive", "negative", or "neutral".

        Args:
            text: A document or a list of documents.
            language (str): The default language for each :func:`~hanlp_restful.HanLPClient.parse` call.
                Contact the service provider for the list of languages supported.
                Conventionally, ``zh`` is used for Chinese and ``mul`` for multilingual.
                Leave ``None`` to use the default language on server.

        Returns:

            Sentiment polarity as a numerical value which measures how positive the sentiment is.

        Examples::

            HanLP.language_identification('''“这是一部男人必看的电影。”人人都这么说。但单纯从性别区分，就会让这电影变狭隘。
            《肖申克的救赎》突破了男人电影的局限，通篇几乎充满令人难以置信的温馨基调，而电影里最伟大的主题是“希望”。
            当我们无奈地遇到了如同肖申克一般囚禁了心灵自由的那种囹圄，我们是无奈的老布鲁克，灰心的瑞德，还是智慧的安迪？
            运用智慧，信任希望，并且勇敢面对恐惧心理，去打败它？
            经典的电影之所以经典，因为他们都在做同一件事——让你从不同的角度来欣赏希望的美好。''')
            0.9505730271339417
        """
        response = self._send_post_json(self._url + '/sentiment_analysis',
                                        {'text': text, 'language': language or self._language})
        return response

    def language_identification(self, text: Union[str, List[str]], topk=False, prob=False) -> Union[
        str, Dict[str, float], List[Union[str, Dict[str, float]]]]:
        """
        Identify the language of a given text.

        Args:
            text: A document or a list of documents.
            topk: ``True`` or ``int`` to return the top-k languages.
            prob: Return also probabilities.

        Returns:

            Identified language in `ISO 639-1 codes`_.

        Examples::

            HanLP.language_identification(
            'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques.')
            'en'
            lang, prob = HanLP.language_identification(
            '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)
            ('ja', 0.9976244568824768)
            HanLP.language_identification(
            '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2)
            ['zh', 'ja']
            HanLP.language_identification(
            '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=3, prob=True)
            {'zh': 0.3952908217906952, 'en': 0.37189167737960815, 'ja': 0.056213412433862686}

        .. _ISO 639-1 codes:
           https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
        """
        return self.text_classification(text, 'lid', topk, prob)


================================================
FILE: plugins/hanlp_restful/setup.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 19:26
from os.path import abspath, join, dirname
from setuptools import find_packages, setup

# Resolve the README relative to this script so that it is found from any working directory.
this_dir = abspath(dirname(__file__))
readme_path = join(this_dir, 'README.md')
with open(readme_path, encoding='utf-8') as readme:
    long_description = readme.read()

# Trove classifiers shown on the package index.
CLASSIFIERS = [
    'Intended Audience :: Science/Research',
    'Intended Audience :: Developers',
    "Development Status :: 3 - Alpha",
    'Operating System :: OS Independent',
    "License :: OSI Approved :: Apache Software License",
    'Programming Language :: Python :: 3 :: Only',
    'Topic :: Scientific/Engineering :: Artificial Intelligence',
    "Topic :: Text Processing :: Linguistic"
]

setup(
    name='hanlp_restful',
    version='0.0.23',
    description='HanLP: Han Language Processing',
    long_description=long_description,
    long_description_content_type="text/markdown",
    url='https://github.com/hankcs/HanLP',
    author='hankcs',
    author_email='hankcshe@gmail.com',
    license='Apache License 2.0',
    classifiers=CLASSIFIERS,
    keywords='corpus,machine-learning,NLU,NLP',
    packages=find_packages(exclude=['docs', 'tests*']),
    include_package_data=True,
    # ``requests`` is deliberately not listed: the client falls back to urllib when it is missing.
    install_requires=[
        'hanlp_common'
    ],
    python_requires='>=3.6',
)


================================================
FILE: plugins/hanlp_restful/tests/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-29 18:05


================================================
FILE: plugins/hanlp_restful/tests/test_client.py
================================================
import unittest

from hanlp_restful import HanLPClient


class TestClient(unittest.TestCase):
    """Smoke tests which send real requests to the public API; each test passes if no exception is raised."""

    def setUp(self) -> None:
        self.HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None)  # Fill in your auth

    def test_raw_text(self):
        """Parse a raw document given as a single string."""
        document = '2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。'
        doc = self.HanLP.parse(document)

    def test_sents(self):
        """Parse a list of sentences through the ``__call__`` shortcut."""
        sentences = ['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',
                     '阿婆主来到北京立方庭参观自然语义科技公司。']
        doc = self.HanLP(sentences)

    def test_tokens(self):
        """Parse pre-tokenized sentences with selected tasks."""
        tokenized = [
            ["2021年", "HanLPv2.1", "为", "生产", "环境", "带来", "次", "世代", "最", "先进", "的", "多语种", "NLP", "技术", "。"],
            ["英", "首相", "与", "特朗普", "通", "电话", "讨论", "华为", "与", "苹果", "公司", "。"]
        ]
        doc = self.HanLP(tokens=tokenized, tasks=['ner*', 'srl', 'dep'])

    def test_sents_mul(self):
        """Parse sentences of several languages with ``language='mul'``."""
        sentences = [
            'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.',
            '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
            '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。']
        doc = self.HanLP.parse(sentences, language='mul')

    def test_tokenize(self):
        """Tokenize a document, a coarse-grained document, sentences and multilingual sentences."""
        document = '商品和服务。阿婆主来到北京立方庭参观自然语义科技公司'
        print(self.HanLP.tokenize(document))
        print(self.HanLP.tokenize(document, coarse=True))
        print(self.HanLP.tokenize(['商品和服务。', '当下雨天地面积水分外严重']))
        multilingual = [
            'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.',
            '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
            '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。']
        print(self.HanLP.tokenize(multilingual, language='mul'))

    def test_coreference_resolution(self):
        """Resolve coreference on a raw document."""
        print(self.HanLP.coreference_resolution('我姐送我她的猫。我很喜欢它。'))

    def test_text_style_transfer(self):
        """Transfer three texts, one request each, to their target styles."""
        requests_to_send = (
            ('国家对中石油抱有很大的期望.', 'gov_doc'),
            ('打工人，打工魂，打工都是人上人', 'gov_doc'),
            ('我看到了窗户外面有白色的云和绿色的森林', 'modern_poetry'),
        )
        for source, style in requests_to_send:
            print(self.HanLP.text_style_transfer(source, target_style=style))

    def test_abstract_meaning_representation(self):
        """Request AMR graphs with and without visualization, from text and from tokens."""
        sentence = '男孩希望女孩相信他。'
        print(self.HanLP.abstract_meaning_representation(sentence))
        print(self.HanLP.abstract_meaning_representation(sentence, visualization='dot'))
        print(self.HanLP.abstract_meaning_representation(sentence, visualization='svg'))
        print(self.HanLP.abstract_meaning_representation(tokens=[['男孩', '希望', '女孩', '相信', '他', '。']]))
        print(self.HanLP.abstract_meaning_representation('The boy wants the girl to believe him.', language='en'))

    def test_keyphrase_extraction(self):
        """Extract the top 3 keyphrases of a short document."""
        document = ('自然语言处理是一门博大精深的学科，掌握理论才能发挥出HanLP的全部性能。 '
                    '《自然语言处理入门》是一本配套HanLP的NLP入门书，助你零起点上手自然语言处理。')
        print(self.HanLP.keyphrase_extraction(document, topk=3))

    def test_extractive_summarization(self):
        """Summarize a news article with the default ``topk``."""
        article = '''
        据DigiTimes报道，在上海疫情趋缓，防疫管控开始放松后，苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。
        据供应链消息人士称，生产厂的订单拉动情况正在慢慢转强，这会提高MacBook Pro机型的供应量，并缩短苹果客户在过去几周所经历的延长交货时间。
        仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货，由于苹果的供应问题，他们的发货时间被大大推迟了。
        据分析师郭明錤表示，广达是高端MacBook Pro的唯一供应商，自防疫封控依赖，MacBook Pro大部分型号交货时间增加了三到五周，
        一些高端定制型号的MacBook Pro配置要到6月底到7月初才能交货。
        尽管MacBook Pro的生产逐渐恢复，但供应问题预计依然影响2022年第三季度的产品销售。
        苹果上周表示，防疫措施和元部件短缺将继续使其难以生产足够的产品来满足消费者的强劲需求，这最终将影响苹果6月份的收入。
            '''
        print(self.HanLP.extractive_summarization(article))


# Run the test cases of this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()


================================================
FILE: plugins/hanlp_restful_golang/README.md
================================================
# gohanlp

Golang RESTful Client for HanLP

We have moved to https://github.com/hankcs/gohanlp

================================================
FILE: plugins/hanlp_restful_java/pom.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hankcs.hanlp.restful</groupId>
    <artifactId>hanlp-restful</artifactId>
    <version>0.0.15</version>

    <name>HanLP RESTful Client in Java</name>
    <url>https://github.com/hankcs/HanLP</url>
    <description>
        HanLP: Han Language Processing
    </description>
    <organization>
        <name>hankcs</name>
        <url>http://www.hankcs.com/</url>
    </organization>
    <licenses>
        <license>
            <name>Apache License Version 2.0</name>
            <url>https://www.apache.org/licenses/LICENSE-2.0.html</url>
        </license>
    </licenses>
    <inceptionYear>2020</inceptionYear>
    <developers>
        <developer>
            <name>hankcs</name>
            <email>cnhankmc@gmail.com</email>
            <url>http://www.hankcs.com</url>
        </developer>
    </developers>
    <scm>
        <connection>scm:git@github.com:hankcs/HanLP.git</connection>
        <developerConnection>scm:git@github.com:hankcs/HanLP.git</developerConnection>
        <url>git@github.com:hankcs/HanLP.git</url>
    </scm>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-source-plugin</artifactId>
                <version>2.4</version>
                <executions>
                    <execution>
                        <id>attach-sources</id>
                        <goals>
                            <goal>jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-javadoc-plugin</artifactId>
                <version>2.9.1</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-gpg-plugin</artifactId>
                <version>1.6</version>
                <executions>
                    <execution>
                        <phase>verify</phase>
                        <goals>
                            <goal>sign</goal>
                        </goals>
                        <configuration>
                            <!-- This is necessary for gpg to not try to use the pinentry programs -->
                            <gpgArguments>
                                <arg>--pinentry-mode</arg>
                                <arg>loopback</arg>
                            </gpgArguments>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.14.1</version>
        </dependency>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter</artifactId>
            <version>RELEASE</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <distributionManagement>
        <snapshotRepository>
            <id>maven-repo</id>
            <url>https://oss.sonatype.org/content/repositories/snapshots/</url>
        </snapshotRepository>
        <repository>
            <id>maven-repo</id>
            <url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
        </repository>
    </distributionManagement>
</project>

================================================
FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/BaseInput.java
================================================
/*
 * <author>Han He</author>
 * <email>me@hankcs.com</email>
 * <create-date>2020-12-27 12:07 AM</create-date>
 *
 * <copyright file="BaseInput.java">
 * Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 * </copyright>
 */
package com.hankcs.hanlp.restful;

/**
 * Request fields shared by every kind of input sent to the {@code /parse} endpoint.
 * <p>
 * Instances are serialized to JSON by {@link HanLPClient}; the public field names (including the snake_case
 * {@code skip_tasks}) are therefore expected to double as the JSON keys and should not be renamed.
 *
 * @author hankcs
 */
public class BaseInput
{
    /**
     * Tasks to perform, may be {@code null}.
     */
    public String[] tasks;
    /**
     * Tasks to skip, may be {@code null}.
     */
    public String[] skip_tasks;
    /**
     * Language of the input, may be {@code null}.
     */
    public String language;

    /**
     * @param tasks     Tasks to perform.
     * @param skipTasks Tasks to skip.
     * @param language  Language of the input.
     */
    public BaseInput(String[] tasks, String[] skipTasks, String language)
    {
        this.language = language;
        this.skip_tasks = skipTasks;
        this.tasks = tasks;
    }
}


================================================
FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/CoreferenceResolutionOutput.java
================================================
/*
 * <author>Han He</author>
 * <email>me@hankcs.com</email>
 * <create-date>2021-10-16 4:43 PM</create-date>
 *
 * <copyright file="CoreferenceResolutionOutput.java">
 * Copyright (c) 2021, Han He. All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 * </copyright>
 */
package com.hankcs.hanlp.restful;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

/**
 * A data class bundling the result of coreference resolution: the mention clusters together with the tokens
 * they were resolved against.
 *
 * @author hankcs
 */
public class CoreferenceResolutionOutput
{
    /**
     * Coreference clusters, each one being a set of mention spans.
     */
    public List<Set<Span>> clusters;
    /**
     * Tokens accompanying the clusters.
     */
    public ArrayList<String> tokens;

    /**
     * @param clusters Coreference clusters.
     * @param tokens   Tokens accompanying the clusters.
     */
    public CoreferenceResolutionOutput(List<Set<Span>> clusters, ArrayList<String> tokens)
    {
        this.tokens = tokens;
        this.clusters = clusters;
    }
}


================================================
FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/DocumentInput.java
================================================
/*
 * <author>Han He</author>
 * <email>me@hankcs.com</email>
 * <create-date>2020-12-27 12:09 AM</create-date>
 *
 * <copyright file="DocumentInput.java">
 * Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 * </copyright>
 */
package com.hankcs.hanlp.restful;

/**
 * A {@link BaseInput} whose payload is a single raw document held in {@link #text}.
 *
 * @author hankcs
 */
public class DocumentInput extends BaseInput
{
    /**
     * The raw document.
     */
    public String text;

    /**
     * @param text      The raw document.
     * @param tasks     Tasks to perform.
     * @param skipTasks Tasks to skip.
     * @param language  Language of the input.
     */
    public DocumentInput(String text, String[] tasks, String[] skipTasks, String language)
    {
        super(tasks, skipTasks, language);
        this.text = text;
    }
}


================================================
FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/HanLPClient.java
================================================
/*
 * <author>Han He</author>
 * <email>me@hankcs.com</email>
 * <create-date>2020-12-26 11:54 PM</create-date>
 *
 * <copyright file="HanLPClient.java">
 * Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 * </copyright>
 */
package com.hankcs.hanlp.restful;


import com.fasterxml.jackson.databind.ObjectMapper;
import com.hankcs.hanlp.restful.mrp.MeaningRepresentation;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.*;

/**
 * A RESTful client implementing the data format specification of HanLP.
 * <p>
 * Each public method serializes its arguments to JSON, POSTs them to an endpoint below the configured base URL and
 * deserializes the JSON response.
 *
 * @author hankcs
 * @see <a href="https://hanlp.hankcs.com/docs/data_format.html">Data Format</a>
 */
public class HanLPClient
{
    private final String url;
    private final String auth;
    private final String language;
    /**
     * Timeout in milliseconds, applied to both connecting and reading.
     */
    private final int timeout;
    private final ObjectMapper mapper;

    /**
     * @param url      An API endpoint to a service provider.
     * @param auth     An auth key licenced by a service provider. When null, the value of the environment variable
     *                 {@code HANLP_AUTH} is used instead (which may itself be unset).
     * @param language The language this client will be expecting. Contact the service provider for the list of
     *                 languages supported. Conventionally, zh is used for Chinese and mul for multilingual.
     *                 Leave null to use the default language on server.
     * @param timeout  Maximum waiting time in seconds for a request.
     */
    public HanLPClient(String url, String auth, String language, int timeout)
    {
        if (auth == null)
        {
            auth = System.getenv("HANLP_AUTH");
        }
        this.url = url;
        this.auth = auth;
        this.language = language;
        // Convert to milliseconds in long arithmetic and clamp, so a large value cannot silently overflow int.
        long millis = timeout * 1000L;
        this.timeout = (int) Math.max(Integer.MIN_VALUE, Math.min(Integer.MAX_VALUE, millis));
        this.mapper = new ObjectMapper();
    }

    /**
     * Create a client using the default language on server and a timeout of 5 seconds.
     *
     * @param url  An API endpoint to a service provider.
     * @param auth An auth key licenced by a service provider.
     */
    public HanLPClient(String url, String auth)
    {
        this(url, auth, null, 5);
    }

    /**
     * Parse a raw document.
     *
     * @param text      Document content which can have multiple sentences.
     * @param tasks     Tasks to perform.
     * @param skipTasks Tasks to skip.
     * @return Parsed annotations.
     * @throws IOException HTTP exception.
     * @see <a href="https://hanlp.hankcs.com/docs/data_format.html">Data Format</a>
     */
    public Map<String, List> parse(String text, String[] tasks, String[] skipTasks) throws IOException
    {
        //noinspection unchecked
        return mapper.readValue(post("/parse", new DocumentInput(text, tasks, skipTasks, language)), Map.class);
    }

    /**
     * Parse a raw document.
     *
     * @param text Document content which can have multiple sentences.
     * @return Parsed annotations.
     * @throws IOException HTTP exception.
     * @see <a href="https://hanlp.hankcs.com/docs/data_format.html">Data Format</a>
     */
    public Map<String, List> parse(String text) throws IOException
    {
        return parse(text, null, null);
    }

    /**
     * Parse an array of sentences.
     *
     * @param sentences Multiple sentences to parse.
     * @param tasks     Tasks to perform.
     * @param skipTasks Tasks to skip.
     * @return Parsed annotations.
     * @throws IOException HTTP exception.
     * @see <a href="https://hanlp.hankcs.com/docs/data_format.html">Data Format</a>
     */
    public Map<String, List> parse(String[] sentences, String[] tasks, String[] skipTasks) throws IOException
    {
        //noinspection unchecked
        return mapper.readValue(post("/parse", new SentenceInput(sentences, tasks, skipTasks, language)), Map.class);
    }

    /**
     * Parse an array of sentences.
     *
     * @param sentences Multiple sentences to parse.
     * @return Parsed annotations.
     * @throws IOException HTTP exception.
     * @see <a href="https://hanlp.hankcs.com/docs/data_format.html">Data Format</a>
     */
    public Map<String, List> parse(String[] sentences) throws IOException
    {
        return parse(sentences, null, null);
    }

    /**
     * Parse an array of pre-tokenized sentences.
     *
     * @param tokens    Multiple pre-tokenized sentences to parse.
     * @param tasks     Tasks to perform.
     * @param skipTasks Tasks to skip.
     * @return Parsed annotations.
     * @throws IOException HTTP exception.
     * @see <a href="https://hanlp.hankcs.com/docs/data_format.html">Data Format</a>
     */
    public Map<String, List> parse(String[][] tokens, String[] tasks, String[] skipTasks) throws IOException
    {
        //noinspection unchecked
        return mapper.readValue(post("/parse", new TokenInput(tokens, tasks, skipTasks, language)), Map.class);
    }

    /**
     * Parse an array of pre-tokenized sentences.
     *
     * @param tokens Multiple pre-tokenized sentences to parse.
     * @return Parsed annotations.
     * @throws IOException HTTP exception.
     * @see <a href="https://hanlp.hankcs.com/docs/data_format.html">Data Format</a>
     */
    public Map<String, List> parse(String[][] tokens) throws IOException
    {
        return parse(tokens, null, null);
    }

    /**
     * Split a document into sentences and tokenize them.
     *
     * @param text   A document.
     * @param coarse {@code true} requests the {@code tok/coarse} task, {@code false} requests {@code tok/fine} and
     *               {@code null} requests the generic {@code tok} task.
     * @return A list of tokenized sentences.
     * @throws IOException HTTP exception, or the response contained no annotation.
     */
    public List<List<String>> tokenize(String text, Boolean coarse) throws IOException
    {
        String task;
        if (coarse == null)
            task = "tok";
        else
            task = coarse ? "tok/coarse" : "tok/fine";
        Map<String, List> doc = parse(text, new String[]{task}, null);
        // Report an empty response as a checked exception rather than leaking NoSuchElementException/NPE.
        if (doc == null || doc.isEmpty())
            throw new IOException("The response contains no annotation for task " + task);
        // Only one task was requested, so its annotation is the first (and only) value.
        //noinspection unchecked
        return doc.values().iterator().next();
    }

    /**
     * Split a document into sentences and tokenize them by requesting the generic {@code tok} task, i.e., without
     * explicitly selecting the coarse-grained or fine-grained standard.
     *
     * @param text A document.
     * @return A list of tokenized sentences.
     * @throws IOException HTTP exception.
     */
    public List<List<String>> tokenize(String text) throws IOException
    {
        return tokenize(text, null);
    }

    /**
     * Text style transfer aims to change the style of the input text to the target style while preserving its content.
     *
     * @param text        Source text.
     * @param targetStyle Target style.
     * @return Text of the target style.
     * @throws IOException HTTP errors.
     */
    public List<String> textStyleTransfer(List<String> text, String targetStyle) throws IOException
    {
        Map<String, Object> input = newInput("text", text);
        input.put("target_style", targetStyle);
        //noinspection unchecked
        return mapper.readValue(post("/text_style_transfer", input), List.class);
    }

    /**
     * Text style transfer aims to change the style of the input text to the target style while preserving its content.
     *
     * @param text        Source text.
     * @param targetStyle Target style.
     * @return Text of the target style.
     * @throws IOException HTTP errors.
     */
    public String textStyleTransfer(String text, String targetStyle) throws IOException
    {
        Map<String, Object> input = newInput("text", text);
        input.put("target_style", targetStyle);
        return mapper.readValue(post("/text_style_transfer", input), String.class);
    }

    /**
     * Grammatical Error Correction (GEC) is the task of correcting different kinds of errors in text such as
     * spelling, punctuation, grammatical, and word choice errors.
     *
     * @param text Text potentially containing different kinds of errors such as spelling, punctuation,
     *             grammatical, and word choice errors.
     * @return Corrected text.
     * @throws IOException HTTP errors.
     */
    public List<String> grammaticalErrorCorrection(List<String> text) throws IOException
    {
        //noinspection unchecked
        return mapper.readValue(post("/grammatical_error_correction", newInput("text", text)), List.class);
    }

    /**
     * Grammatical Error Correction (GEC) is the task of correcting different kinds of errors in text such as
     * spelling, punctuation, grammatical, and word choice errors.
     *
     * @param text Text potentially containing different kinds of errors such as spelling, punctuation,
     *             grammatical, and word choice errors.
     * @return Corrected text.
     * @throws IOException HTTP errors.
     */
    public String[] grammaticalErrorCorrection(String[] text) throws IOException
    {
        return mapper.readValue(post("/grammatical_error_correction", newInput("text", text)), String[].class);
    }

    /**
     * Grammatical Error Correction (GEC) is the task of correcting different kinds of errors in text such as
     * spelling, punctuation, grammatical, and word choice errors.
     *
     * @param text Text potentially containing different kinds of errors such as spelling, punctuation,
     *             grammatical, and word choice errors.
     * @return Corrected text.
     * @throws IOException HTTP errors.
     */
    public String grammaticalErrorCorrection(String text) throws IOException
    {
        return mapper.readValue(post("/grammatical_error_correction", newInput("text", text)), String.class);
    }

    /**
     * Semantic textual similarity deals with determining how similar two pieces of texts are.
     *
     * @param textA The first text.
     * @param textB The second text.
     * @return Their similarity.
     * @throws IOException HTTP errors.
     */
    public Double semanticTextualSimilarity(String textA, String textB) throws IOException
    {
        Map<String, Object> input = newInput("text", new String[]{textA, textB});
        return mapper.readValue(post("/semantic_textual_similarity", input), Double.class);
    }

    /**
     * Semantic textual similarity deals with determining how similar two pieces of texts are.
     *
     * @param text The pairs of text.
     * @return Their similarities.
     * @throws IOException HTTP errors.
     */
    public List<Double> semanticTextualSimilarity(String[][] text) throws IOException
    {
        //noinspection unchecked
        return mapper.readValue(post("/semantic_textual_similarity", newInput("text", text)), List.class);
    }

    /**
     * Coreference resolution is the task of clustering mentions in text that refer to the same underlying real world entities.
     *
     * @param text A piece of text, usually a document without tokenization.
     * @return Coreference resolution clusters and tokens.
     * @throws IOException HTTP errors.
     */
    public CoreferenceResolutionOutput coreferenceResolution(String text) throws IOException
    {
        Map<String, Object> input = newInput("text", text);
        //noinspection unchecked
        Map<String, List> response = mapper.readValue(post("/coreference_resolution", input), Map.class);
        //noinspection unchecked
        List<List<List>> clusters = response.get("clusters");
        //noinspection unchecked
        return new CoreferenceResolutionOutput(_convert_clusters(clusters), (ArrayList<String>) response.get("tokens"));
    }

    /**
     * Coreference resolution is the task of clustering mentions in text that refer to the same underlying real world entities.
     *
     * @param tokens   A list of sentences where each sentence is a list of tokens.
     * @param speakers A list of speakers where each speaker is a String representing the speaker's ID, e.g., "Tom".
     * @return Coreference resolution clusters.
     * @throws IOException HTTP errors.
     */
    public List<Set<Span>> coreferenceResolution(String[][] tokens, String[] speakers) throws IOException
    {
        Map<String, Object> input = newInput("tokens", tokens);
        input.put("speakers", speakers);
        //noinspection unchecked
        List<List<List>> clusters = mapper.readValue(post("/coreference_resolution", input), List.class);
        return _convert_clusters(clusters);
    }

    /**
     * Coreference resolution is the task of clustering mentions in text that refer to the same underlying real world entities.
     *
     * @param tokens A list of sentences where each sentence is a list of tokens.
     * @return Coreference resolution clusters.
     * @throws IOException HTTP errors.
     */
    public List<Set<Span>> coreferenceResolution(String[][] tokens) throws IOException
    {
        Map<String, Object> input = newInput("tokens", tokens);
        //noinspection unchecked
        List<List<List>> clusters = mapper.readValue(post("/coreference_resolution", input), List.class);
        return _convert_clusters(clusters);
    }

    /**
     * Convert clusters deserialized as nested lists into sets of {@link Span}s. Each span is read as a triple of
     * form, begin and end, in that order. The order of spans within a cluster is preserved.
     *
     * @param clusters Clusters as nested lists.
     * @return One set of spans per cluster.
     */
    private static List<Set<Span>> _convert_clusters(List<List<List>> clusters)
    {
        List<Set<Span>> results = new ArrayList<>(clusters.size());
        for (List<List> cluster : clusters)
        {
            Set<Span> spans = new LinkedHashSet<>();
            for (List span : cluster)
            {
                // Go through Number instead of Integer so that an offset deserialized as another numeric type
                // (e.g. Long) does not fail the cast.
                int begin = ((Number) span.get(1)).intValue();
                int end = ((Number) span.get(2)).intValue();
                spans.add(new Span((String) span.get(0), begin, end));
            }
            results.add(spans);
        }
        return results;
    }

    /**
     * Abstract Meaning Representation (AMR) captures “who is doing what to whom” in a sentence. Each sentence is
     * represented as a rooted, directed, acyclic graph consisting of nodes (concepts) and edges (relations).
     *
     * @param text A piece of text, usually a document without tokenization.
     * @return AMR graphs.
     * @throws IOException HTTP errors.
     */
    public MeaningRepresentation[] abstractMeaningRepresentation(String text) throws IOException
    {
        Map<String, Object> input = newInput("text", text);
        return mapper.readValue(post("/abstract_meaning_representation", input), MeaningRepresentation[].class);
    }

    /**
     * Abstract Meaning Representation (AMR) captures “who is doing what to whom” in a sentence. Each sentence is
     * represented as a rooted, directed, acyclic graph consisting of nodes (concepts) and edges (relations).
     *
     * @param tokens A list of sentences where each sentence is a list of tokens.
     * @return AMR graphs.
     * @throws IOException HTTP errors.
     */
    public MeaningRepresentation[] abstractMeaningRepresentation(String[][] tokens) throws IOException
    {
        Map<String, Object> input = newInput("tokens", tokens);
        return mapper.readValue(post("/abstract_meaning_representation", input), MeaningRepresentation[].class);
    }

    /**
     * Keyphrase extraction aims to identify keywords or phrases reflecting the main topics of a document.
     *
     * @param text The text content of the document. Preferably the concatenation of the title and the content.
     * @param topk The number of top-K ranked keywords or keyphrases.
     * @return A dictionary containing each keyphrase and its ranking score s between 0 and 1.
     * @throws IOException HTTP errors.
     */
    public Map<String, Double> keyphraseExtraction(String text, int topk) throws IOException
    {
        Map<String, Object> input = newInput("text", text);
        input.put("topk", topk);
        //noinspection unchecked
        return mapper.readValue(post("/keyphrase_extraction", input), LinkedHashMap.class);
    }

    /**
     * Single document summarization is the task of selecting a subset of the sentences which best
     * represents a summary of the document, with a balance of salience and redundancy. This overload asks for at
     * most 3 sentences.
     *
     * @param text The text content of the document.
     * @return A dictionary containing each sentence and its ranking score s between 0 and 1.
     * @throws IOException HTTP errors.
     */
    public Map<String, Double> extractiveSummarization(String text) throws IOException
    {
        return extractiveSummarization(text, 3);
    }

    /**
     * Single document summarization is the task of selecting a subset of the sentences which best
     * represents a summary of the document, with a balance of salience and redundancy.
     *
     * @param text The text content of the document.
     * @param topk The maximum number of top-K ranked sentences. Note that due to Trigram Blocking tricks, the actual
     *             number of returned sentences could be less than ``topk``.
     * @return A dictionary containing each sentence and its ranking score s between 0 and 1.
     * @throws IOException HTTP errors.
     */
    public Map<String, Double> extractiveSummarization(String text, int topk) throws IOException
    {
        Map<String, Object> input = newInput("text", text);
        input.put("topk", topk);
        //noinspection unchecked
        return mapper.readValue(post("/extractive_summarization", input), LinkedHashMap.class);
    }

    /**
     * Abstractive Summarization is the task of generating a short and concise summary that captures the
     * salient ideas of the source text. The generated summaries potentially contain new phrases and sentences that
     * may not appear in the source text.
     *
     * @param text The text content of the document.
     * @return Summarization.
     * @throws IOException HTTP errors.
     */
    public String abstractiveSummarization(String text) throws IOException
    {
        return mapper.readValue(post("/abstractive_summarization", newInput("text", text)), String.class);
    }

    /**
     * Text classification is the task of assigning a sentence or document an appropriate category.
     * The categories depend on the chosen dataset and can range from topics.
     *
     * @param text  The text content of the document.
     * @param model The model to use for prediction.
     * @return Classification results.
     * @throws IOException HTTP errors.
     */
    public String textClassification(String text, String model) throws IOException
    {
        return (String) textClassification(text, model, false, false);
    }


    /**
     * Sentiment analysis is the task of classifying the polarity of a given text. For instance,
     * a text-based tweet can be categorized into either "positive", "negative", or "neutral".
     *
     * @param text The text content of the document.
     * @return Sentiment polarity as a numerical value which measures how positive the sentiment is.
     * @throws IOException HTTP errors.
     */
    public Double sentimentAnalysis(String text) throws IOException
    {
        return mapper.readValue(post("/sentiment_analysis", newInput("text", text)), Double.class);
    }


    /**
     * Text classification is the task of assigning a sentence or document an appropriate category.
     * The categories depend on the chosen dataset and can range from topics.
     * <p>
     * Unlike the other requests, this one does not carry the client's language.
     *
     * @param text  A document or a list of documents.
     * @param model The model to use for prediction.
     * @param topk  `true` or `int` to return the top-k categories.
     * @param prob  Return also probabilities.
     * @return Classification results, deserialized into plain Java objects.
     * @throws IOException HTTP errors.
     */
    public Object textClassification(Object text, String model, Object topk, boolean prob) throws IOException
    {
        Map<String, Object> input = new HashMap<>();
        input.put("text", text);
        input.put("model", model);
        input.put("topk", topk);
        input.put("prob", prob);
        return mapper.readValue(post("/text_classification", input), Object.class);
    }

    /**
     * Recognize the language of a given text.
     *
     * @param text The text content of the document.
     * @return Identified language in <a href="https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes">ISO 639-1 codes</a>.
     * @throws IOException HTTP errors.
     */
    public String languageIdentification(String text) throws IOException
    {
        return textClassification(text, "lid");
    }

    /**
     * Recognize the languages of multiple texts.
     *
     * @param text The text contents of the documents.
     * @return Identified languages in <a href="https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes">ISO 639-1 codes</a>.
     * @throws IOException HTTP errors.
     */
    public List<String> languageIdentification(String[] text) throws IOException
    {
        //noinspection unchecked
        return (List<String>) textClassification(text, "lid", false, false);
    }

    /**
     * Keyphrase extraction aims to identify keywords or phrases reflecting the main topics of a document.
     *
     * @param text The text content of the document. Preferably the concatenation of the title and the content.
     * @return A dictionary containing 10 keyphrases and their ranking scores s between 0 and 1.
     * @throws IOException HTTP errors.
     */
    public Map<String, Double> keyphraseExtraction(String text) throws IOException
    {
        return keyphraseExtraction(text, 10);
    }

    /**
     * Create a request body holding one payload entry plus the language of this client.
     *
     * @param key   Name of the payload entry, e.g., {@code text} or {@code tokens}.
     * @param value The payload.
     * @return A mutable map to which further entries can be added.
     */
    private Map<String, Object> newInput(String key, Object value)
    {
        Map<String, Object> input = new HashMap<>();
        input.put(key, value);
        input.put("language", language);
        return input;
    }

    /**
     * Serialize {@code input_} to JSON, POST it to {@code api} and return the response body.
     *
     * @param api    Path of the API relative to the base URL, e.g., {@code /parse}.
     * @param input_ The request body, serialized with Jackson.
     * @return The response body, with each line trimmed and line breaks removed.
     * @throws IOException Network errors, or the status code is not 200. In the latter case the message is the
     *                     {@code detail} field of the error response if it is a string, otherwise a generic message
     *                     carrying the status code.
     */
    private String post(String api, Object input_) throws IOException
    {
        String jsonInputString = mapper.writeValueAsString(input_);

        URL url = new URL(this.url + api);
        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        con.setRequestMethod("POST");
        if (auth != null)
            con.setRequestProperty("Authorization", "Basic " + auth);
        // A media type parameter has to be in name=value form, a bare "utf-8" is malformed.
        con.setRequestProperty("Content-Type", "application/json; charset=utf-8");
        con.setRequestProperty("Accept", "application/json");
        con.setDoOutput(true);
        con.setConnectTimeout(timeout);
        con.setReadTimeout(timeout);

        try (OutputStream os = con.getOutputStream())
        {
            byte[] input = jsonInputString.getBytes(StandardCharsets.UTF_8);
            os.write(input, 0, input.length);
        }

        int code = con.getResponseCode();
        if (code != 200)
        {
            String error = String.format("Request failed, status code = %d, error = %s", code, con.getResponseMessage());
            // getErrorStream() returns null when the server sent no error body.
            InputStream errorStream = con.getErrorStream();
            if (errorStream != null)
            {
                try
                {
                    Map detail = mapper.readValue(readAll(errorStream), Map.class);
                    Object message = detail.get("detail");
                    if (message instanceof String)
                        error = (String) message;
                    else if (message != null)
                        error = error + ", detail = " + message;
                }
                catch (Exception ignored)
                {
                    // Best effort only: an unreadable or non-JSON error body falls back to the generic message.
                }
            }
            throw new IOException(error);
        }

        return readAll(con.getInputStream());
    }

    /**
     * Read a UTF-8 stream to its end and close it.
     *
     * @param stream The stream to read.
     * @return All lines of the stream, each one trimmed, concatenated without line breaks.
     * @throws IOException Reading failed.
     */
    private static String readAll(InputStream stream) throws IOException
    {
        StringBuilder response = new StringBuilder();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))
        {
            String responseLine;
            while ((responseLine = br.readLine()) != null)
            {
                response.append(responseLine.trim());
            }
        }
        return response.toString();
    }

}


================================================
FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/SentenceInput.java
================================================
/*
 * <author>Han He</author>
 * <email>me@hankcs.com</email>
 * <create-date>2020-12-27 12:09 AM</create-date>
 *
 * <copyright file="SentenceInput.java">
 * Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 * </copyright>
 */
package com.hankcs.hanlp.restful;

/**
 * A {@link BaseInput} whose payload is an array of sentences held in {@link #text}.
 *
 * @author hankcs
 */
public class SentenceInput extends BaseInput
{
    /**
     * The sentences, one per element.
     */
    public String[] text;

    /**
     * @param text      The sentences.
     * @param tasks     Tasks to perform.
     * @param skipTasks Tasks to skip.
     * @param language  Language of the input.
     */
    public SentenceInput(String[] text, String[] tasks, String[] skipTasks, String language)
    {
        super(tasks, skipTasks, language);
        this.text = text;
    }
}


================================================
FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/Span.java
================================================
/*
 * <author>Han He</author>
 * <email>me@hankcs.com</email>
 * <create-date>2021-10-16 4:26 PM</create-date>
 *
 * <copyright file="Span.java">
 * Copyright (c) 2021, Han He. All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 * </copyright>
 */
package com.hankcs.hanlp.restful;

import java.util.Objects;

/**
 * A common data format to represent a span.
 *
 * @author hankcs
 */
public class Span
{
    /**
     * The raw form of a span, which can be either a token, an entity or a mention etc.
     */
    public String form;
    /**
     * The inclusive beginning offset of a span.
     */
    public int begin;
    /**
     * The exclusive ending offset of a span.
     */
    public int end;

    public Span(String form, int begin, int end)
    {
        this.form = form;
        this.begin = begin;
        this.end = end;
    }

    @Override
    public boolean equals(Object o)
    {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        Span span = (Span) o;
        return begin == span.begin &&
                end == span.end &&
                form.equals(span.form);
    }

    @Override
    public int hashCode()
    {
        return Objects.hash(form, begin, end);
    }

    @Override
    public String toString()
    {
        return String.format("[%d, %d) = %s", begin, end, form);
    }
}


================================================
FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/TokenInput.java
================================================
/*
 * <author>Han He</author>
 * <email>me@hankcs.com</email>
 * <create-date>2020-12-27 12:09 AM</create-date>
 *
 * <copyright file="TokenInput.java">
 * Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 * </copyright>
 */
package com.hankcs.hanlp.restful;

/**
 * A {@link BaseInput} whose payload is an array of pre-tokenized sentences held in {@link #tokens}.
 *
 * @author hankcs
 */
public class TokenInput extends BaseInput
{
    /**
     * The pre-tokenized sentences: one inner array of tokens per sentence.
     */
    public String[][] tokens;

    /**
     * @param tokens    The pre-tokenized sentences.
     * @param tasks     Tasks to perform.
     * @param skipTasks Tasks to skip.
     * @param language  Language of the input.
     */
    public TokenInput(String[][] tokens, String[] tasks, String[] skipTasks, String language)
    {
        super(tasks, skipTasks, language);
        this.tokens = tokens;
    }
}


================================================
FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/Anchor.java
================================================
/*
 * <author>Han He</author>
 * <email>me@hankcs.com</email>
 * <create-date>2022-04-13 8:58 AM</create-date>
 *
 * <copyright file="Anchor.java">
 * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 * </copyright>
 */
package com.hankcs.hanlp.restful.mrp;

/**
 * An anchor of a {@link Node} in a meaning representation graph, given as a {@code from}/{@code to} pair.
 * <p>
 * NOTE(review): presumably the pair delimits the region of the input that the node is aligned to; the values are
 * kept as strings. Confirm against the service's response format.
 *
 * @author hankcs
 */
public class Anchor
{
    /**
     * Where the anchor starts.
     */
    public String from;
    /**
     * Where the anchor ends.
     */
    public String to;
}


================================================
FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/Edge.java
================================================
/*
 * <author>Han He</author>
 * <email>me@hankcs.com</email>
 * <create-date>2022-04-13 9:01 AM</create-date>
 *
 * <copyright file="Edge.java">
 * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 * </copyright>
 */
package com.hankcs.hanlp.restful.mrp;

/**
 * A labeled, directed edge of a meaning representation graph.
 * <p>
 * NOTE(review): {@code source} and {@code target} presumably refer to {@link Node#id} values; confirm against the
 * service's response format.
 *
 * @author hankcs
 */
public class Edge
{
    /**
     * The node this edge starts from.
     */
    public int source;
    /**
     * The node this edge points to.
     */
    public int target;
    /**
     * The label (relation) of this edge.
     */
    public String label;
}


================================================
FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/MeaningRepresentation.java
================================================
/*
 * <author>Han He</author>
 * <email>me@hankcs.com</email>
 * <create-date>2022-04-13 8:57 AM</create-date>
 *
 * <copyright file="MeaningRepresentation.java">
 * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 * </copyright>
 */
package com.hankcs.hanlp.restful.mrp;

/**
 * Graph-based meaning representation. A plain data holder with public fields: an identifier, the input text, the
 * nodes and edges of the graph, its top entries and the name of the framework.
 *
 * @author hankcs
 */
public class MeaningRepresentation
{
    // Identifier of this graph.
    public String id;
    // The input text this graph was produced for.
    public String input;
    // Nodes of the graph.
    public Node[] nodes;
    // Edges between the nodes.
    public Edge[] edges;
    // Top entries of the graph. NOTE(review): the sample JSON in MeaningRepresentationTest carries numbers here
    // (e.g. [7]), presumably node ids — confirm.
    public String[] tops;
    // Name of the framework of this graph (the sample JSON in MeaningRepresentationTest uses "amr").
    public String framework;
}


================================================
FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/Node.java
================================================
/*
 * <author>Han He</author>
 * <email>me@hankcs.com</email>
 * <create-date>2022-04-13 8:57 AM</create-date>
 *
 * <copyright file="Node.java">
 * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 * </copyright>
 */
package com.hankcs.hanlp.restful.mrp;

/**
 * A node of a {@link MeaningRepresentation} graph: an id, a label, optional properties with their values and the
 * anchors of the node.
 *
 * @author hankcs
 */
public class Node
{
    // Numeric identifier of the node.
    public int id;
    // Label of the node.
    public String label;
    // Property names of the node. NOTE(review): in the sample JSON of MeaningRepresentationTest this array is
    // absent for some nodes and otherwise has the same length as "values" — presumably the two are parallel.
    public String[] properties;
    // Property values of the node (see the note on "properties").
    public String[] values;
    // Anchors of the node; may be an empty array in the sample JSON.
    public Anchor[] anchors;
}


================================================
FILE: plugins/hanlp_restful_java/src/test/java/com/hankcs/hanlp/restful/HanLPClientTest.java
================================================
package com.hankcs.hanlp.restful;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Smoke tests for {@link HanLPClient}. Each test calls one client method against the endpoint configured in
 * {@link #setUp()} and prints the result; no assertion is made on the returned values, so a test only fails when
 * the call or the printing throws.
 */
class HanLPClientTest
{
    /**
     * Shared JSON mapper used by {@link #prettyPrint(Object)}; one instance is reused instead of building a new
     * mapper for every printed object.
     */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    HanLPClient client;

    /**
     * Creates a fresh client before every test, passing {@code null} as the second constructor argument.
     */
    @BeforeEach
    void setUp()
    {
        client = new HanLPClient("https://hanlp.hankcs.com/api", null);
    }

    // Parses a raw document given as a single string.
    @Test
    void parseText() throws IOException
    {
        Map<String, List> doc = client.parse("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。英首相与特朗普通电话讨论华为与苹果公司。");
        prettyPrint(doc);
    }

    // Parses a document given as an array of sentences.
    @Test
    void parseSentences() throws IOException
    {
        Map<String, List> doc = client.parse(new String[]{
                "2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。",
                "英首相与特朗普通电话讨论华为与苹果公司。"
        });
        prettyPrint(doc);
    }

    // Parses a document given as already tokenized sentences.
    @Test
    void parseTokens() throws IOException
    {
        Map<String, List> doc = client.parse(new String[][]{
                new String[]{"2021年", "HanLPv2.1", "为", "生产", "环境", "带来", "次", "世代", "最", "先进", "的", "多语种", "NLP", "技术", "。"},
                new String[]{"英", "首相", "与", "特朗普", "通", "电话", "讨论", "华为", "与", "苹果", "公司", "。"},
        });
        prettyPrint(doc);
    }

    // Parses with an explicit list of tasks to run and a list of tasks to skip.
    @Test
    void parseCoarse() throws IOException
    {
        Map<String, List> doc = client.parse(
                "阿婆主来到北京立方庭参观自然语义科技公司。",
                new String[]{"tok/coarse", "pos", "dep"},
                new String[]{"tok/fine"});
        prettyPrint(doc);
    }

    // Tokenizes the same text with the one-argument overload and with the boolean flag set to true.
    @Test
    void tokenize() throws IOException
    {
        List<List<String>> fine = client.tokenize("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。");
        System.out.println(fine);
        List<List<String>> coarse = client.tokenize("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。", true);
        System.out.println(coarse);
    }

    // Transfers a sentence into the "gov_doc" style.
    @Test
    void textStyleTransfer() throws IOException
    {
        String doc = client.textStyleTransfer("国家对中石油抱有很大的期望.", "gov_doc");
        prettyPrint(doc);
    }

    // Scores one sentence pair, then a batch of pairs.
    @Test
    void semanticTextualSimilarity() throws IOException
    {
        Double similarity = client.semanticTextualSimilarity("看图猜一电影名", "看图猜电影");
        prettyPrint(similarity);
        List<Double> similarities = client.semanticTextualSimilarity(new String[][]{
                new String[]{"看图猜一电影名", "看图猜电影"},
                new String[]{"北京到上海的动车票", "上海到北京的动车票"}
        });
        for (Double similarityPerPair : similarities)
        {
            prettyPrint(similarityPerPair);
        }
    }

    // Coreference resolution on raw text.
    @Test
    void coreferenceResolutionText() throws IOException
    {
        CoreferenceResolutionOutput clusters = client.coreferenceResolution("我姐送我她的猫。我很喜欢它。");
        prettyPrint(clusters);
    }

    // Coreference resolution on tokenized sentences.
    @Test
    void coreferenceResolutionTokens() throws IOException
    {
        List<Set<Span>> clusters = client.coreferenceResolution(
                new String[][]{
                        new String[]{"我", "姐", "送", "我", "她", "的", "猫", "。"},
                        new String[]{"我", "很", "喜欢", "它", "。"}});
        prettyPrint(clusters);
    }

    // Coreference resolution on tokenized sentences with one speaker name per sentence.
    @Test
    void coreferenceResolutionTokensWithSpeakers() throws IOException
    {
        List<Set<Span>> clusters = client.coreferenceResolution(
                new String[][]{
                        new String[]{"我", "姐", "送", "我", "她", "的", "猫", "。"},
                        new String[]{"我", "很", "喜欢", "它", "。"}},
                new String[]{"张三", "张三"});
        prettyPrint(clusters);
    }

    // Keyphrase extraction with 3 as the second argument.
    @Test
    void keyphraseExtraction() throws IOException
    {
        prettyPrint(client.keyphraseExtraction(
                "自然语言处理是一门博大精深的学科，掌握理论才能发挥出HanLP的全部性能。" +
                        "《自然语言处理入门》是一本配套HanLP的NLP入门书，助你零起点上手自然语言处理。", 3));
    }

    // Extractive summarization of a multi-line news text.
    @Test
    void extractiveSummarization() throws IOException
    {
        prettyPrint(client.extractiveSummarization(
                "据DigiTimes报道，在上海疫情趋缓，防疫管控开始放松后，苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。\n" +
                        "据供应链消息人士称，生产厂的订单拉动情况正在慢慢转强，这会提高MacBook Pro机型的供应量，并缩短苹果客户在过去几周所经历的延长交货时间。\n" +
                        "仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货，由于苹果的供应问题，他们的发货时间被大大推迟了。\n" +
                        "据分析师郭明錤表示，广达是高端MacBook Pro的唯一供应商，自防疫封控依赖，MacBook Pro大部分型号交货时间增加了三到五周，\n" +
                        "一些高端定制型号的MacBook Pro配置要到6月底到7月初才能交货。\n" +
                        "尽管MacBook Pro的生产逐渐恢复，但供应问题预计依然影响2022年第三季度的产品销售。\n" +
                        "苹果上周表示，防疫措施和元部件短缺将继续使其难以生产足够的产品来满足消费者的强劲需求，这最终将影响苹果6月份的收入。"));
    }

    // Abstractive summarization of a multi-line news text.
    @Test
    void abstractiveSummarization() throws IOException
    {
        prettyPrint(client.abstractiveSummarization(
                "每经AI快讯，2月4日，长江证券研究所金属行业首席分析师王鹤涛表示，2023年海外经济衰退，美债现处于历史高位，\n" +
                        "黄金的趋势是值得关注的；在国内需求修复的过程中，看好大金属品种中的铜铝钢。\n" +
                        "此外，在细分的小品种里，建议关注两条主线，一是新能源，比如锂、钴、镍、稀土，二是专精特新主线。（央视财经）"));
    }

    // Abstract meaning representation parsing of raw text.
    @Test
    void abstractMeaningRepresentationText() throws IOException
    {
        prettyPrint(client.abstractMeaningRepresentation("男孩希望女孩相信他。阿婆主来到北京立方庭参观自然语义科技公司。"));
    }

    // Abstract meaning representation parsing of tokenized sentences.
    @Test
    void abstractMeaningRepresentationTokens() throws IOException
    {
        prettyPrint(client.abstractMeaningRepresentation(new String[][]{
                new String[]{"2021年", "HanLPv2.1", "为", "生产", "环境", "带来", "次", "世代", "最", "先进", "的", "多语种", "NLP", "技术", "。"},
                new String[]{"英", "首相", "与", "特朗普", "通", "电话", "讨论", "华为", "与", "苹果", "公司", "。"}}));
    }

    // Grammatical error correction of a batch of sentences.
    @Test
    void grammaticalErrorCorrection() throws IOException
    {
        prettyPrint(client.grammaticalErrorCorrection(new String[]{"每个青年都应当有远大的报复。", "有的同学对语言很兴趣。"}));
    }

    // Language identification of an English, a Japanese and a Chinese sentence.
    @Test
    void languageIdentification() throws IOException
    {
        prettyPrint(client.languageIdentification(new String[]{
                "In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.",
                "2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。",
                "2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。",
        }));
    }

    // Sentiment analysis of a movie review.
    @Test
    void sentimentAnalysis() throws IOException
    {
        prettyPrint(client.sentimentAnalysis(
                "“这是一部男人必看的电影。”人人都这么说。但单纯从性别区分，就会让这电影变狭隘。《肖申克的救赎》突破了男人电影的局限，通篇几乎充满令人难以置信的温馨基调，而电影里最伟大的主题是“希望”。 当我们无奈地遇到了如同肖申克一般囚禁了心灵自由的那种囹圄，我们是无奈的老布鲁克，灰心的瑞德，还是智慧的安迪？运用智慧，信任希望，并且勇敢面对恐惧心理，去打败它？ 经典的电影之所以经典，因为他们都在做同一件事——让你从不同的角度来欣赏希望的美好。"
        ));
    }

    /**
     * Prints {@code object} to standard output as indented JSON.
     *
     * @param object the value to serialize
     * @throws JsonProcessingException if the value cannot be serialized
     */
    void prettyPrint(Object object) throws JsonProcessingException
    {
        System.out.println(MAPPER.writerWithDefaultPrettyPrinter().writeValueAsString(object));
    }
}

================================================
FILE: plugins/hanlp_restful_java/src/test/java/com/hankcs/hanlp/restful/MeaningRepresentationTest.java
================================================
package com.hankcs.hanlp.restful;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.hankcs.hanlp.restful.mrp.MeaningRepresentation;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;

class MeaningRepresentationTest
{


    @Test
    void parseText() throws IOException
    {
        String json = "[{\"id\": \"0\", \"input\": \"北京 大学 计算 语言学 研究所 和 富士通 研究 开发 中心 有限公司 ， 得到 了 人民日报社 新闻 信息 中心 的 语料库 。\", \"nodes\": [{\"id\": 0, \"label\": \"name\", \"properties\": [\"op1\", \"op2\"], \"values\": [\"北京\", \"大学\"], \"anchors\": [{\"from\": 0, \"to\": 2}, {\"from\": 3, \"to\": 5}]}, {\"id\": 1, \"label\": \"university\", \"anchors\": []}, {\"id\": 2, \"label\": \"name\", \"properties\": [\"op1\", \"op2\", \"op4\"], \"values\": [\"计算\", \"语言学\", \"\"], \"anchors\": [{\"from\": 6, \"to\": 8}, {\"from\": 9, \"to\": 12}, {\"from\": 13, \"to\": 16}]}, {\"id\": 3, \"label\": \"research-institute\", \"anchors\": []}, {\"id\": 4, \"label\": \"and\", \"anchors\": []}, {\"id\": 5, \"label\": \"name\", \"properties\": [\"op1\", \"op2\", \"op3\", \"op4\", \"op5\"], \"values\": [\"富士通\", \"研究\", \"开发\", \"中心\", \"有限公司\"], \"anchors\": [{\"from\": 19, \"to\": 22}, {\"from\": 23, \"to\": 25}, {\"from\": 26, \"to\": 28}, {\"from\": 29, \"to\": 31}, {\"from\": 32, \"to\": 36}]}, {\"id\": 6, \"label\": \"company\", \"anchors\": []}, {\"id\": 7, \"label\": \"得到-01\", \"anchors\": [{\"from\": 39, \"to\": 41}]}, {\"id\": 8, \"label\": \"了\", \"anchors\": [{\"from\": 42, \"to\": 43}]}, {\"id\": 9, \"label\": \"name\", \"properties\": [\"op1\"], \"values\": [\"人民日报社\"], \"anchors\": [{\"from\": 44, \"to\": 49}]}, {\"id\": 10, \"label\": \"organization\", \"anchors\": []}, {\"id\": 11, \"label\": \"name\", \"properties\": [\"op1\", \"op2\", \"op3\"], \"values\": [\"新闻\", \"信息\", \"中心\"], \"anchors\": [{\"from\": 50, \"to\": 52}, {\"from\": 53, \"to\": 55}, {\"from\": 56, \"to\": 58}]}, {\"id\": 12, \"label\": \"organization\", \"anchors\": []}, {\"id\": 13, \"label\": \"语料库\", \"anchors\": [{\"from\": 61, \"to\": 64}]}], \"edges\": [{\"source\": 7, \"target\": 8, \"label\": \"aspect\"}, {\"source\": 7, \"target\": 4, \"label\": \"arg0\"}, {\"source\": 10, \"target\": 9, \"label\": \"name\"}, {\"source\": 4, \"target\": 6, \"label\": \"op2\"}, {\"source\": 7, \"target\": 
13, \"label\": \"arg1\"}, {\"source\": 6, \"target\": 5, \"label\": \"name\"}, {\"source\": 12, \"target\": 11, \"label\": \"name\"}, {\"source\": 3, \"target\": 2, \"label\": \"name\"}, {\"source\": 1, \"target\": 0, \"label\": \"name\"}, {\"source\": 13, \"target\": 12, \"label\": \"poss\"}, {\"source\": 4, \"target\": 3, \"label\": \"op1\"}, {\"source\": 12, \"target\": 9, \"label\": \"name\"}, {\"source\": 1, \"target\": 3, \"label\": \"part\"}], \"tops\": [7], \"framework\": \"amr\"}]";
        ObjectMapper mapper = new ObjectMapper();
        MeaningRepresentation[] graphs = mapper.readValue(json, MeaningRepresentation[].class);
        prettyPrint(graphs);
    }


    void prettyPrint(Object object) throws JsonProcessingException
    {
        ObjectMapper mapper = new ObjectMapper();
        System.out.println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(object));
    }
}

================================================
FILE: plugins/hanlp_trie/README.md
================================================
# Trie interface and implementation for HanLP

[中文](https://github.com/hankcs/HanLP/tree/doc-zh) | [1.x](https://github.com/hankcs/HanLP/tree/1.x) | [forum](https://bbs.hankcs.com/) | [docker](https://github.com/WalterInSH/hanlp-jupyter-docker)

The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be efficient, user-friendly and extensible. It comes with pretrained models for various human languages including English, Chinese and many others. Currently, HanLP 2.0 is in the alpha stage with more killer features on the roadmap. Discussions are welcome on our [forum](https://bbs.hankcs.com/), while bug reports and feature requests are reserved for GitHub issues. For Java users, please check out the [1.x](https://github.com/hankcs/HanLP/tree/1.x) branch.

## Installation

```bash
pip install hanlp
```


## License

HanLP is licensed under **Apache License 2.0**. You can use HanLP in your commercial products for free. We would appreciate it if you add a link to HanLP on your website.


================================================
FILE: plugins/hanlp_trie/hanlp_trie/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-29 17:48
from .trie import Trie
from .dictionary import DictInterface, TrieDict


================================================
FILE: plugins/hanlp_trie/hanlp_trie/dictionary.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-29 17:53
from abc import ABC, abstractmethod
from typing import List, Tuple, Any, Dict, Union, Sequence, Iterable, Optional

from hanlp_common.configurable import Configurable
from hanlp_common.reflection import classpath_of
from hanlp_trie.trie import Trie


class DictInterface(ABC):
    """Interface of a dictionary that can locate its keys inside a piece of text."""

    @abstractmethod
    def tokenize(self, text: Union[str, Sequence[str]]) -> List[Tuple[int, int, Any]]:
        """Implement this method to tokenize a piece of text into a list of non-intersecting spans, each span is a
        tuple of ``(begin_offset, end_offset, label)``, where label is some properties related to this span and
        downstream tasks have the freedom to define what kind of labels they want.

        Args:
            text: The text to be tokenized.

        Returns:
              A list of spans.

        """
        pass

    def split(self, text: Union[str, Sequence[str]]) -> List[Union[str, Sequence[str], Tuple[int, int, Any]]]:
        """Like the :meth:`str.split`, this method splits a piece of text into chunks by taking the keys in this
        dictionary as delimiters. The delimiters are the spans returned by :meth:`tokenize`. Unlike the
        :meth:`str.split`, the delimiters are kept: each matched span is placed in the results list right after the
        unmatched text that precedes it.

        Args:
            text: A piece of text.

        Returns:
            A list mixing two kinds of chunks in text order: unmatched pieces, which are slices of ``text``, and
            matched spans, which are tuples of ``(begin_offset, end_offset, label)``. Empty unmatched pieces are
            not emitted.
        """
        chunks = []
        cursor = 0  # end offset of the last matched span, i.e. where the next unmatched piece would start
        for begin, end, label in self.tokenize(text):
            if begin > cursor:
                chunks.append(text[cursor:begin])
            chunks.append((begin, end, label))
            cursor = end
        # Whatever follows the last match is one final unmatched piece.
        if cursor < len(text):
            chunks.append(text[cursor:])
        return chunks


class TrieDict(Trie, DictInterface, Configurable):
    def __init__(self, dictionary: Optional[Union[Dict[Iterable[str], Any], Iterable[str]]] = None) -> None:
        r"""
        A trie-backed dictionary for custom dictionary strategies in tokenization and tagging. It accepts either
        a dict of key-value pairs or a collection of keys; the construction itself is delegated to the parent
        constructor.

        Args:
            dictionary: A custom dictionary of string-value pairs, or a collection of strings.
        """
        super().__init__(dictionary)

    def tokenize(self, text: Union[str, Sequence[str]]) -> List[Tuple[int, int, Any]]:
        """Locate dictionary keys in ``text`` by delegating to ``parse_longest``."""
        return self.parse_longest(text)

    def split_batch(self, data: List[str]) -> Tuple[List[str], List[int], List[List[Tuple[int, int, Any]]]]:
        """ Cut every sentence of a batch at the dictionary keys found by :meth:`tokenize`. The pieces that are not
        covered by any key are collected into a new, flat batch; for each piece the index of its sentence is
        recorded, and for each sentence the matched keys are recorded with their spans and values.

        It's used in conjunction with :meth:`~hanlp_trie.dictionary.TrieDict.merge_batch`.

        Args:
            data: A batch of sentences.

        Returns:
            A tuple of the new batch, the sentence index of each piece in the new batch, and the matched
            ``(begin, end, value)`` spans per sentence.
        """
        pieces = []
        owners = []
        matches = []
        for sent_id, sentence in enumerate(data):
            matches.append([])
            hits = self.tokenize(sentence)
            if not hits:
                # Nothing matched: the sentence is forwarded as a single piece.
                pieces.append(sentence)
                owners.append(sent_id)
                continue
            cursor = 0
            for begin, finish, label in hits:
                if cursor < begin:
                    pieces.append(sentence[cursor:begin])
                    owners.append(sent_id)
                matches[sent_id].append((begin, finish, label))
                cursor = finish
            if cursor != len(sentence):
                pieces.append(sentence[cursor:])
                owners.append(sent_id)
        return pieces, owners, matches

    @staticmethod
    def merge_batch(data, new_outputs, new_data_belongs, parts):
        """ Undo :meth:`~hanlp_trie.dictionary.TrieDict.split_batch`: stitch the outputs predicted for the pieces of
        each sentence back together, inserting an entry for every matched key at its position.

        Args:
            data: The original batch of sentences that was split.
            new_outputs: One output (a list of tokens) per piece of the split batch.
            new_data_belongs: The sentence index of each piece.
            parts: The matched ``(begin, end, value)`` spans per sentence.

        Returns:
            One merged output per sentence.
        """
        # Group the outputs of the pieces by the sentence they came from.
        per_sentence = [[] for _ in range(len(data))]
        for output, owner in zip(new_outputs, new_data_belongs):
            per_sentence[owner].append(output)

        merged = []
        for pending, keys, sentence in zip(per_sentence, parts, data):
            if not keys:
                # No key matched in this sentence, so it was forwarded as exactly one piece.
                merged.append(pending[0])
                continue
            tokens = []
            consumed = 0
            for begin, finish, label in keys:
                # Emit the outputs of the pieces that precede this key.
                while consumed < begin:
                    piece = pending.pop(0)
                    consumed += sum(len(token) for token in piece)
                    tokens += piece
                # A list value contributes all its items, a str value contributes itself, and any other value
                # falls back to the matched slice of the sentence.
                if isinstance(label, list):
                    tokens += label
                elif isinstance(label, str):
                    tokens.append(label)
                else:
                    tokens.append(sentence[begin:finish])
                consumed = finish
            if pending:
                # At most the piece after the last key may be left.
                assert len(pending) == 1
                tokens += pending[0]
            merged.append(tokens)
        return merged

    @property
    def config(self):
        """The class path of this object together with its content as a plain dict."""
        content = dict(self.items())
        return {
            'classpath': classpath_of(self),
            'dictionary': content
        }


class TupleTrieDict(TrieDict):
    def __init__(self, dictionary: Optional[Union[Dict[Iterable[str], Any], Iterable[str]]] = None) -> None:
        r"""
        A variant of ``TrieDict`` that additionally supports serializing/deserializing a dict whose keys are tuples.
        Besides whatever ``TrieDict`` accepts, a non-empty list whose first element is a list or tuple is read as
        a list of ``(key, value)`` pairs, and every key is converted to a tuple.

        Args:
            dictionary: A custom dictionary, or a list of ``(key, value)`` pairs.
        """
        looks_like_pairs = isinstance(dictionary, list) and dictionary and isinstance(dictionary[0], (list, tuple))
        if looks_like_pairs:
            dictionary = {tuple(key): value for key, value in dictionary}
        super().__init__(dictionary)

    @property
    def config(self):
        """The class path of this object together with its content as a list of ``(key, value)`` pairs."""
        pairs = list(self.items(prefix=()))
        return {
            'classpath': classpath_of(self),
            'dictionary': pairs
        }

    def parse_longest(self, text: Sequence[str]) -> List[Tuple[int, int, Any]]:
        """Longest-prefix-matching which scans the text from head to tail and, at each position, keeps the longest
        key starting there. The scan resumes right after a match, so matches don't overlap. The trie transits on
        one-element slices of ``text``.

        Args:
            text: A sequence of transition criteria.

        Returns:
            A list of ``(begin, end, value)`` tuples.

        """
        matches = []
        start = 0
        while start < len(text):
            node = self.transit(text[start:start + 1])
            if not node:
                start += 1
                continue
            stop = start + 1
            payload = node._value
            # Keep walking as long as the trie allows, remembering the last position that carried a value.
            for pos in range(start + 1, len(text)):
                node = node.transit(text[pos:pos + 1])
                if not node:
                    break
                if node._value is not None:
                    payload = node._value
                    stop = pos + 1
            if payload is None:
                start += 1
            else:
                matches.append((start, stop, payload))
                start = stop
        return matches


================================================
FILE: plugins/hanlp_trie/hanlp_trie/trie.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-04 23:46
from typing import Dict, Any, List, Tuple, Sequence, Union, Iterable, Optional


class Node(object):
    def __init__(self, value=None) -> None:
        """A node in a trie tree.

        Args:
            value: The value associated with this node. ``None`` means that no key ends at this node.
        """
        self._children = {}
        self._value = value

    def _get_or_add_child(self, char):
        """Return the child reached by ``char``, creating an empty one if it does not exist yet.

        Args:
            char: A single transition criterion (a character or a token).

        Returns:
            The existing or newly created child node.
        """
        child = self._children.get(char)
        if child is None:
            child = Node(None)
            self._children[char] = child
        return child

    def transit(self, key):
        """Transit the state of a Deterministic Finite Automata (DFA) with key.

        Args:
            key: A sequence of criterion (tokens or characters) used to transit to a new state.

        Returns:
            A new state if the transition succeeded, otherwise ``None``. An empty key returns this node itself.

        """
        state = self
        for char in key:
            state = state._children.get(char)
            if state is None:
                break
        return state

    def _walk(self, prefix: Union[str, tuple], ordered=False):
        """Yield every ``(key, value)`` pair stored below this node in depth-first order.

        Args:
            prefix: The key of this node. A ``str`` prefix is extended by concatenation, any other prefix is
                extended as a tuple.
            ordered: ``True`` to visit the children of every node in sorted order instead of insertion order.

        Yields:
            Tuples of ``(key, value)`` for each node that holds a value.
        """
        children = self._children.items()
        if ordered:
            children = sorted(children)
        for char, child in children:
            prefix_new = prefix + (char if isinstance(prefix, str) else (char,))
            # Only ``None`` marks the absence of a value; falsy values such as 0 or '' are legitimate.
            if child._value is not None:
                yield prefix_new, child._value
            # Propagate ``ordered`` so that deeper levels are sorted as well.
            yield from child._walk(prefix_new, ordered)


class Trie(Node):
    def __init__(self, tokens: Optional[Union[Dict[str, Any], Iterable[str]]] = None) -> None:
        """A referential implementation of the trie (:cite:`10.1145/1457838.1457895`) structure. It stores a dict by
        assigning each key/value pair a :class:`~hanlp_trie.trie.Node` in a trie tree. It provides get/set/del/items
        methods just like a :class:`dict` does. Additionally, it also provides longest-prefix-matching and keywords
        lookup against a piece of text, which are very helpful in rule-based Natural Language Processing.

        A value of ``None`` means "no such key": missing keys read as ``None`` and are reported as absent by ``in``.

        Args:
            tokens: A set of keys or a dict mapping. Keys of a non-dict collection are stored with value ``True``.
        """
        super().__init__()
        self._size = 0
        if tokens:
            if isinstance(tokens, dict):
                for k, v in tokens.items():
                    self[k] = v
            else:
                for k in tokens:
                    self[k] = True

    def __contains__(self, key):
        """Return whether a non-``None`` value is stored under ``key``."""
        return self[key] is not None

    def __getitem__(self, key):
        """Return the value stored under ``key``, or ``None`` (instead of raising) when there is none."""
        state = self.transit(key)
        if state is None:
            return None
        return state._value

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``, creating the nodes along the path when necessary.

        Args:
            key: A non-empty sequence of transition criteria; an empty key raises ``IndexError``.
            value: The value to store. Storing ``None`` is equivalent to removing the key.
        """
        state = self
        for char in key[:-1]:
            state = state._get_or_add_child(char)

        leaf = state._get_or_add_child(key[-1])
        # The size only changes when the key switches between "absent" (None) and "present".
        if leaf._value is None:
            if value is not None:
                self._size += 1
        elif value is None:
            self._size -= 1
        leaf._value = value

    def __delitem__(self, key):
        """Remove the value stored under ``key``. Deleting a key that holds no value is a no-op.

        The nodes along the path are kept; only the value is cleared.
        """
        state = self.transit(key)
        # A node can exist merely as a prefix of longer keys; it must not be counted as a removed key.
        if state is not None and state._value is not None:
            state._value = None
            self._size -= 1

    def update(self, dic: Dict[str, Any]):
        """Store every key/value pair of ``dic`` in this trie and return the trie itself."""
        for k, v in dic.items():
            self[k] = v
        return self

    def parse(self, text: Sequence[str]) -> List[Tuple[int, int, Any]]:
        """Keywords lookup which takes a piece of text as input, and lookup all occurrences of keywords in it. These
        occurrences can overlap with each other.

        Args:
            text: A piece of text. In HanLP's design, it doesn't really matter whether this is a str or a list of str.
                The trie will transit on either types properly, which means a list of str simply defines a list of
                transition criteria while a str defines each criterion as a character.

        Returns:
            A list of ``(begin, end, value)`` tuples.
        """
        found = []
        for i in range(len(text)):
            state = self
            for j in range(i, len(text)):
                state = state.transit(text[j])
                if state is None:
                    break
                if state._value is not None:
                    found.append((i, j + 1, state._value))
        return found

    def parse_longest(self, text: Sequence[str]) -> List[Tuple[int, int, Any]]:
        """Longest-prefix-matching which tries to match the longest keyword sequentially from the head of the text till
        its tail. By definition, the matches won't overlap with each other.

        Args:
            text: A piece of text. In HanLP's design, it doesn't really matter whether this is a str or a list of str.
                The trie will transit on either types properly, which means a list of str simply defines a list of
                transition criteria while a str defines each criterion as a character.

        Returns:
            A list of ``(begin, end, value)`` tuples.

        """
        found = []
        i = 0
        while i < len(text):
            state = self.transit(text[i])
            if state is not None:
                end = i + 1
                value = state._value
                # Walk as far as the trie allows, remembering the last position that carried a value.
                for to in range(i + 1, len(text)):
                    state = state.transit(text[to])
                    if state is None:
                        break
                    if state._value is not None:
                        value = state._value
                        end = to + 1
                if value is not None:
                    found.append((i, end, value))
                    # Resume right after the match; the increment below completes the jump to ``end``.
                    i = end - 1
            i += 1
        return found

    def items(self, ordered=False, prefix=''):
        """Yield all ``(key, value)`` pairs of this trie.

        Args:
            ordered: Passed on to the underlying walk to request sorted traversal.
            prefix: The initial prefix of the keys; its type (``str`` or ``tuple``) decides the type of the keys.
        """
        yield from self._walk(prefix, ordered)

    def __len__(self):
        """Return the number of keys that currently hold a value."""
        return self._size

    def __bool__(self):
        """Return ``True`` if at least one key is stored."""
        return bool(len(self))


================================================
FILE: plugins/hanlp_trie/setup.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 19:26
from os.path import abspath, join, dirname
from setuptools import find_packages, setup

# Use the README.md that sits next to this script as the long description of the package.
this_dir = abspath(dirname(__file__))
with open(join(this_dir, 'README.md'), encoding='utf-8') as file:
    long_description = file.read()

# Packaging metadata of the ``hanlp_trie`` distribution.
setup(
    name='hanlp_trie',
    version='0.0.5',
    description='HanLP: Han Language Processing',
    long_description=long_description,
    long_description_content_type="text/markdown",
    url='https://github.com/hankcs/HanLP',
    author='hankcs',
    author_email='hankcshe@gmail.com',
    license='Apache License 2.0',
    classifiers=[
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        "Development Status :: 3 - Alpha",
        'Operating System :: OS Independent',
        "License :: OSI Approved :: Apache Software License",
        'Programming Language :: Python :: 3 :: Only',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        "Topic :: Text Processing :: Linguistic"
    ],
    keywords='corpus,machine-learning,NLU,NLP',
    # Ship every package found here except documentation and tests.
    packages=find_packages(exclude=['docs', 'tests*']),
    include_package_data=True,
    # The only runtime dependency declared for this plugin.
    install_requires=[
        'hanlp_common'
    ],
    python_requires='>=3.6',
)


================================================
FILE: plugins/hanlp_trie/tests/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-29 18:05


================================================
FILE: plugins/hanlp_trie/tests/test_trie.py
================================================
import unittest

from hanlp_trie import Trie


class TestTrie(unittest.TestCase):
    """Tests of keyword lookup, longest matching, item iteration and size bookkeeping of ``Trie``."""

    def build_small_trie(self):
        """Create the five-entry trie shared by the tests."""
        entries = {'商品': 'goods', '和': 'and', '和服': 'kimono', '服务': 'service', '务': 'business'}
        return Trie(entries)

    def assert_results_valid(self, text, results, trie):
        """Assert that the value of every reported span equals what ``trie`` stores for the spanned text."""
        for span in results:
            begin, end, value = span
            self.assertEqual(value, trie[text[begin:end]])

    def test_parse(self):
        """``parse`` reports all occurrences, overlapping ones included."""
        text = '商品和服务'
        trie = self.build_small_trie()
        expected = [
            (0, 2, 'goods'),
            (2, 3, 'and'),
            (2, 4, 'kimono'),
            (3, 5, 'service'),
            (4, 5, 'business'),
        ]
        actual = trie.parse(text)
        self.assert_results_valid(text, actual, trie)
        self.assertEqual(expected, actual)

    def test_parse_longest(self):
        """``parse_longest`` reports non-overlapping longest matches."""
        text = '商品和服务'
        trie = self.build_small_trie()
        expected = [(0, 2, 'goods'), (2, 4, 'kimono'), (4, 5, 'business')]
        actual = trie.parse_longest(text)
        self.assert_results_valid(text, actual, trie)
        self.assertEqual(expected, actual)

    def test_items(self):
        """``items`` yields every stored pair."""
        expected = [('商品', 'goods'), ('和', 'and'), ('和服', 'kimono'), ('服务', 'service'), ('务', 'business')]
        pairs = list(self.build_small_trie().items())
        self.assertEqual(expected, pairs)

    def test_len(self):
        """The size follows overwrites, deletions and re-insertions."""
        trie = self.build_small_trie()
        self.assertEqual(len(trie), 5)

        # Overwriting an existing key must not grow the trie.
        trie['和'] = '&'
        self.assertEqual(len(trie), 5)

        del trie['和']
        self.assertEqual(len(trie), 4)

        # Putting the key back restores the size.
        trie['和'] = '&'
        self.assertEqual(len(trie), 5)

# Run the tests of this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()


================================================
FILE: plugins/hanlp_trie/tests/test_trie_dict.py
================================================
import unittest

from hanlp_trie import TrieDict


class TestTrieDict(unittest.TestCase):
    """Check dictionary-driven tokenization and batch handling of ``TrieDict``."""

    def setUp(self) -> None:
        """Prepare a sentence containing '重要' twice and a one-entry dictionary."""
        super().setUp()
        self.text = '第一个词语很重要，第二个词语也很重要'
        self.trie_dict = TrieDict({'重要': 'important'})

    def test_tokenize(self):
        """Both occurrences of the key are expected as ``(begin, end, value)``."""
        expected_spans = [(6, 8, 'important'), (16, 18, 'important')]
        found_spans = self.trie_dict.tokenize(self.text)
        self.assertEqual(expected_spans, found_spans)

    def test_split_batch(self):
        """Splitting then merging is expected to put the dictionary value at each match."""
        batch = [self.text]
        pieces, owners, parts = self.trie_dict.split_batch(batch)
        # Stand-in "model output": every remaining piece is split per character.
        char_level = list(map(list, pieces))
        expected = [
            ['第', '一', '个', '词', '语', '很', 'important', '，',
             '第', '二', '个', '词', '语', '也', '很', 'important']
        ]
        merged = self.trie_dict.merge_batch(batch, char_level, owners, parts)
        self.assertSequenceEqual(expected, merged)

    def test_tokenize_2(self):
        """A dictionary built from a set is expected to report ``True`` as the value."""
        words = TrieDict({'次世代', '生产环境'})
        spans = words.tokenize('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。')
        self.assertSequenceEqual(spans, [(15, 19, True), (21, 24, True)])

    def test_empty_dict(self):
        """Truthiness follows whether the dictionary currently holds any entry."""
        empty = TrieDict()
        self.assertFalse(bool(empty))

        empty['one'] = 1
        self.assertTrue(bool(empty))

        del empty['one']
        self.assertFalse(bool(empty))


# Allow running this file directly as a script.
if __name__ == '__main__':
    unittest.main()


================================================
FILE: setup.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 19:26
import sys
from os.path import abspath, join, dirname
from setuptools import find_packages, setup

# Resolve paths relative to this file so the build works from any directory.
this_dir = abspath(dirname(__file__))
# The README is used verbatim as the package's long description.
with open(join(this_dir, 'README.md'), encoding='utf-8') as file:
    long_description = file.read()
# Execute hanlp/version.py into a scratch namespace to obtain ``__version__``;
# this way the hanlp package itself is not imported during the build.
version = {}
# Read with an explicit encoding, like the README above, instead of relying on
# the platform's locale default.
with open(join(this_dir, "hanlp", "version.py"), encoding='utf-8') as fp:
    exec(fp.read(), version)

# Pinned fastText distribution referenced by the optional dependency groups.
FASTTEXT = 'fasttext-wheel==0.9.2'
sys_version_info = sys.version_info

# An explicit tokenizers pin is added only for Python 3.6 on macOS or Windows;
# everywhere else the list stays empty.
_needs_tokenizers_pin = (
    sys.platform in {'darwin', 'win32'}
    and (sys_version_info.major, sys_version_info.minor) == (3, 6)
)
TOKENIZERS = ['tokenizers==0.10.3'] if _needs_tokenizers_pin else []

# Optional dependency groups, keyed by the extra's name.
extras_require = {
    'amr': [
        'penman==1.2.1',
        'networkx>=2.5.1',
        'perin-parser>=0.0.12',
    ],
    'fasttext': [FASTTEXT],
    'tf': [FASTTEXT, 'tensorflow>=2.6.0,<2.14']
}
# 'full' is the de-duplicated union of every group above. Sorting keeps the
# generated metadata identical between builds; the iteration order of a bare
# set of strings can differ from one interpreter run to the next.
extras_require['full'] = sorted({req for reqs in extras_require.values() for req in reqs})

# Trove classifiers published with the package metadata.
_CLASSIFIERS = [
    'Intended Audience :: Science/Research',
    'Intended Audience :: Developers',
    'Development Status :: 4 - Beta',
    'Operating System :: OS Independent',
    'License :: OSI Approved :: Apache Software License',
    'Programming Language :: Python :: 3.6',
    'Programming Language :: Python :: 3.7',
    'Programming Language :: Python :: 3.8',
    'Programming Language :: Python :: 3.9',
    'Programming Language :: Python :: 3.10',
    'Topic :: Scientific/Engineering :: Artificial Intelligence',
    'Topic :: Text Processing :: Linguistic',
]

# Mandatory runtime dependencies; TOKENIZERS contributes an extra pin only
# when it is non-empty.
_INSTALL_REQUIRES = [
    'termcolor',
    'pynvml',
    'toposort==1.5',
    'transformers>=4.1.1',
    'sentencepiece>=0.1.91',  # Essential for tokenization_bert_japanese
    'torch>=1.6.0',
    'hanlp-common>=0.0.22',
    'hanlp-trie>=0.0.4',
    'hanlp-downloader',
    *TOKENIZERS,
]

setup(
    name='hanlp',
    version=version['__version__'],
    description='HanLP: Han Language Processing',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/hankcs/HanLP',
    author='hankcs',
    author_email='hankcshe@gmail.com',
    license='Apache License 2.0',
    classifiers=_CLASSIFIERS,
    keywords='corpus,machine-learning,NLU,NLP',
    # Ship only the library packages; documentation and tests stay out.
    packages=find_packages(exclude=['docs', 'tests*']),
    include_package_data=True,
    install_requires=_INSTALL_REQUIRES,
    extras_require=extras_require,
    python_requires='>=3.6',
)


================================================
FILE: tests/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-06-13 23:43
import os

# Absolute path of the directory that contains this package's folder.
_package_dir = os.path.dirname(__file__)
root = os.path.abspath(os.path.join(_package_dir, os.pardir))


def cdroot():
    """Make the project root the current working directory.

    After this call, relative paths resolve against ``root``, so models are
    saved in the root folder.
    """
    os.chdir(root)


================================================
FILE: tests/test_config_tracker.py
================================================
import unittest

from hanlp.common.structure import ConfigTracker


class MyClass(ConfigTracker):
    # Minimal ConfigTracker subclass used as the fixture for the test in this
    # file.
    def __init__(self, i_need_this='yes') -> None:
        # Hand the constructor's local variables to ConfigTracker. The test
        # expects ``i_need_this`` to be retrievable from ``self.config``
        # afterwards. Keep ``locals()`` as the first statement: any local
        # defined before it would be captured as well.
        super().__init__(locals())


class TestConfigTracker(unittest.TestCase):
    """Verify that constructor arguments end up in the tracked config."""

    def test_init(self):
        """The default value of ``i_need_this`` must be recorded as 'yes'."""
        tracked = MyClass()
        recorded = tracked.config.get('i_need_this', None)
        self.assertEqual(recorded, 'yes')


# Allow running this file directly as a script.
if __name__ == '__main__':
    unittest.main()


================================================
FILE: tests/test_mtl.py
================================================
import hanlp
import unittest
from multiprocessing.dummy import Pool
from hanlp_common.document import Document

# Load the pretrained multi-task model once, at import time; every test in this
# module uses this single shared instance.
# NOTE(review): devices=-1 presumably selects CPU-only execution — confirm
# against the hanlp.load documentation.
mtl = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH, devices=-1)


def tokenize(mtl, text):
    """Run only the fine-grained tokenization task and return its output.

    Args:
        mtl: Callable invoked as ``mtl(text, tasks='tok/fine')``; its result
            must be indexable by ``'tok/fine'``.
        text: Input forwarded unchanged to ``mtl``.

    Returns:
        Whatever ``mtl`` stored under the ``'tok/fine'`` key.
    """
    task = 'tok/fine'
    doc = mtl(text, tasks=task)
    return doc[task]


class TestMultiTaskLearning(unittest.TestCase):
    """End-to-end checks of the shared module-level multi-task model ``mtl``.

    Several tests reconfigure the tokenizer of that shared model. Each of them
    registers a cleanup that puts the touched setting back to ``None`` /
    ``False`` (the same reset values ``test_tok_offset`` has always used), so a
    failing assertion cannot leak configuration into later tests.
    """

    def test_mtl_single_sent(self):
        """A single string yields one flat list of tokens."""
        doc: Document = mtl('商品和服务')
        self.assertSequenceEqual(doc['tok/fine'], ["商品", "和", "服务"])

    def test_mtl_multiple_sents(self):
        """A list of strings yields one token list per sentence."""
        doc: Document = mtl(['商品和服务', '研究生命'])
        self.assertSequenceEqual(doc['tok/fine'], [
            ["商品", "和", "服务"],
            ["研究", "生命"]
        ])

    def test_mtl_empty_str(self):
        """Empty and whitespace-only inputs must be processed without raising."""
        mtl('')
        mtl(' ')
        mtl([''])
        mtl([' '])
        mtl(['', ' '])
        mtl(['', ' ', 'good'])
        mtl([[]], skip_tasks='tok*')

    def test_skip_tok(self):
        """With tokenization skipped, the given tokens are passed through as 'tok'."""
        pre_tokenized_sents = [
            ["商品和服务", '一个', '词'],
            ["研究", "生命"]
        ]
        doc: Document = mtl(pre_tokenized_sents, skip_tasks='tok*')
        self.assertSequenceEqual(doc['tok'], pre_tokenized_sents)

    def test_sdp_as_the_first_task(self):
        """Requesting only 'sdp' on pre-tokenized input returns just 'sdp' and 'tok'."""
        doc: Document = mtl(['人', '吃', '鱼'], tasks='sdp', skip_tasks='tok*')
        self.assertDictEqual(
            doc.to_dict(),
            {
                "sdp": [
                    [(2, "Agt")],
                    [(0, "Root")],
                    [(2, "Pat")]
                ],
                "tok": [
                    "人",
                    "吃",
                    "鱼"
                ]
            }
        )

    def test_threading(self):
        """Concurrent calls from a thread pool all return the same tokenization."""
        num_proc = 8
        # multiprocessing.dummy.Pool is backed by threads, so ``mtl`` is shared.
        with Pool(num_proc) as pool:
            results = pool.starmap(tokenize, [(mtl, '商品和服务')] * num_proc)
            self.assertSequenceEqual(results, [['商品', '和', '服务']] * num_proc)

    def test_emoji(self):
        """A text emoticon is split by default and kept whole via ``dict_combine``."""
        self.assertSequenceEqual(mtl('( ͡° ͜ʖ ͡ °)你好', tasks='tok/fine')['tok/fine'],
                                 ["(", " ͡", "°", " ͜", "ʖ", " ͡ ", "°", ")", "你", "好"])
        # Undo the dictionary change even if the assertion below fails.
        self.addCleanup(setattr, mtl['tok/fine'], 'dict_combine', None)
        mtl['tok/fine'].dict_combine = {'( ͡° ͜ʖ ͡ °)'}
        self.assertSequenceEqual(mtl('( ͡° ͜ʖ ͡ °)你好', tasks='tok/fine')['tok/fine'],
                                 ["( ͡° ͜ʖ ͡ °)", "你", "好"])

    def test_unicode_removed_by_hf(self):
        """A lone combining character must come back as a token of its own."""
        self.assertSequenceEqual(mtl('͡', tasks='tok/fine')['tok/fine'], ['͡'])

    def test_space(self):
        """Spaces separate tokens by default; ``dict_combine`` can join across them."""
        task = 'tok/fine'
        doc: Document = mtl('商品 和服务', tasks=task)
        self.assertSequenceEqual(doc[task], ["商品", "和", "服务"])
        # Undo the dictionary change even if the assertion below fails.
        self.addCleanup(setattr, mtl[task], 'dict_combine', None)
        mtl[task].dict_combine = {('iPad', 'Pro'), '2个空格'}
        self.assertSequenceEqual(mtl("如何评价iPad Pro ？iPad  Pro有2个空格", tasks=task)[task],
                                 ['如何', '评价', 'iPad Pro', '？', 'iPad  Pro', '有', '2个空格'])

    def test_transform(self):
        """``dict_force`` keeps '用户ID' whole while the lowercase variant is still split."""
        task = 'tok/fine'
        # Undo the dictionary change even if the assertion below fails.
        self.addCleanup(setattr, mtl[task], 'dict_force', None)
        mtl[task].dict_force = {'用户ID'}
        self.assertSequenceEqual(mtl("我的用户ID跟你的用户id不同", tasks=task)[task],
                                 ['我', '的', '用户ID', '跟', '你', '的', '用户', 'id', '不同'])

    def test_tok_offset(self):
        """With ``output_spans`` on, each ``(token, begin, end)`` indexes the input."""
        task = 'tok/fine'
        tok = mtl[task]

        def restore():
            # Same reset the test has always performed at its end; registered
            # as a cleanup so that it also runs when an assertion fails.
            tok.config.output_spans = False
            tok.dict_force = None
            tok.dict_combine = None

        self.addCleanup(restore)
        tok.config.output_spans = True
        tok.dict_force = None
        tok.dict_combine = None
        sent = '我先去看医生'

        for t, b, e in mtl(sent, tasks=task)[task]:
            self.assertEqual(t, sent[b:e])

        # Offsets must stay consistent when tokens are merged by a dictionary.
        tok.dict_combine = {'先去'}
        for t, b, e in mtl(sent, tasks=task)[task]:
            self.assertEqual(t, sent[b:e])


# Allow running this file directly as a script.
if __name__ == '__main__':
    unittest.main()


================================================
FILE: tests/test_pipeline.py
================================================
import unittest
import hanlp


class TestPipeLine(unittest.TestCase):
    """Tests for pipelines created with ``hanlp.pipeline()``."""

    def test_copy(self):
        """A copied pipeline is a distinct object that can be extended on its own."""
        pipe = hanlp.pipeline().append(hanlp.utils.rules.split_sentence)
        copied_pipe = pipe.copy()
        test_text = "今天天气真好。我要去散步。"
        # unittest assertions are used instead of bare ``assert`` so the checks
        # still run under ``python -O`` and report both operands on failure.
        self.assertIsNot(pipe, copied_pipe)
        # Extending the copy must not affect the original; if it did, both
        # pipelines would produce the same output below.
        copied_pipe.append(lambda sent: "".join(sent))
        self.assertNotEqual(pipe(test_text), copied_pipe(test_text))


# Allow running this file directly as a script.
if __name__ == '__main__':
    unittest.main()


================================================
FILE: tests/test_rules.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-03-22 17:17
import unittest

from hanlp.utils.rules import split_sentence


class TestRules(unittest.TestCase):
    """Checks for the rule-based sentence splitter."""

    def test_eos(self):
        """``split_sentence`` must yield exactly the expected sentences per input."""
        cases = (
            ('叶', ['叶']),
            ('他说：“加油。”谢谢', ['他说：“加油。”', '谢谢']),
            ('Go to hankcs.com. Yes.', ['Go to hankcs.com.', 'Yes.']),
        )
        for raw, sentences in cases:
            self.assertListEqual(list(split_sentence(raw)), sentences)


# Allow running this file directly as a script.
if __name__ == '__main__':
    unittest.main()


================================================
FILE: tests/test_string_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-03-22 17:17
import unittest

from hanlp.utils.string_util import possible_tokenization


class TestStringUtility(unittest.TestCase):
    """Tests for helpers in ``hanlp.utils.string_util``."""

    def test_enumerate_tokenization(self):
        """``possible_tokenization`` enumerates every way to segment the text."""
        text = '商品和服务'
        toks = possible_tokenization(text)
        # A text of n characters has n - 1 gaps, each either split or not,
        # hence 2 ** (n - 1) distinct segmentations. unittest assertions are
        # used instead of bare ``assert`` so the checks survive ``python -O``.
        self.assertEqual(len(set(toks)), 2 ** (len(text) - 1))
        for each in toks:
            # Every segmentation must concatenate back to the original text.
            self.assertEqual(''.join(each), text)


# Allow running this file directly as a script.
if __name__ == '__main__':
    unittest.main()